From ef990761580e046c1f990d438c29d5279df74874 Mon Sep 17 00:00:00 2001 From: Maximus Date: Fri, 6 Dec 2019 12:34:08 -0800 Subject: [PATCH] Update DLP samples to use dlp_v2 client. (#2580) --- dlp/deid.py | 538 +++++++++-------- dlp/deid_test.py | 107 ++-- dlp/inspect_content.py | 1126 +++++++++++++++++++++-------------- dlp/inspect_content_test.py | 236 ++++---- dlp/jobs.py | 78 +-- dlp/jobs_test.py | 45 +- dlp/metadata.py | 29 +- dlp/metadata_test.py | 2 +- dlp/quickstart.py | 40 +- dlp/quickstart_test.py | 13 +- dlp/redact.py | 118 ++-- dlp/redact_test.py | 16 +- dlp/risk.py | 681 ++++++++++++--------- dlp/risk_test.py | 135 +++-- dlp/templates.py | 185 +++--- dlp/templates_test.py | 10 +- dlp/triggers.py | 206 ++++--- dlp/triggers_test.py | 26 +- 18 files changed, 2029 insertions(+), 1562 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index c73631a3fc87..423e0c26c64c 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -20,8 +20,9 @@ # [START dlp_deidentify_masking] -def deidentify_with_mask(project, string, info_types, masking_character=None, - number_to_mask=0): +def deidentify_with_mask( + project, string, info_types, masking_character=None, number_to_mask=0 +): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. Args: @@ -39,25 +40,23 @@ def deidentify_with_mask(project, string, info_types, masking_character=None, import google.cloud.dlp # Instantiate a client - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Construct inspect configuration dictionary - inspect_config = { - 'info_types': [{'name': info_type} for info_type in info_types] - } + inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} # Construct deidentify configuration dictionary deidentify_config = { - 'info_type_transformations': { - 'transformations': [ + "info_type_transformations": { + "transformations": [ { - 'primitive_transformation': { - 'character_mask_config': { - 'masking_character': masking_character, - 'number_to_mask': number_to_mask + "primitive_transformation": { + "character_mask_config": { + "masking_character": masking_character, + "number_to_mask": number_to_mask, } } } @@ -66,21 +65,33 @@ def deidentify_with_mask(project, string, info_types, masking_character=None, } # Construct item - item = {'value': string} + item = {"value": string} # Call the API response = dlp.deidentify_content( - parent, inspect_config=inspect_config, - deidentify_config=deidentify_config, item=item) + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) # Print out the results. print(response.item.value) + + # [END dlp_deidentify_masking] # [START dlp_deidentify_fpe] -def deidentify_with_fpe(project, string, info_types, alphabet=None, - surrogate_type=None, key_name=None, wrapped_key=None): +def deidentify_with_fpe( + project, + string, + info_types, + alphabet=None, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): """Uses the Data Loss Prevention API to deidentify sensitive data in a string using Format Preserving Encryption (FPE). Args: @@ -106,7 +117,7 @@ def deidentify_with_fpe(project, string, info_types, alphabet=None, import google.cloud.dlp # Instantiate a client - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. 
parent = dlp.project_path(project) @@ -114,38 +125,31 @@ def deidentify_with_fpe(project, string, info_types, alphabet=None, # The wrapped key is base64-encoded, but the library expects a binary # string, so decode it here. import base64 + wrapped_key = base64.b64decode(wrapped_key) # Construct FPE configuration dictionary crypto_replace_ffx_fpe_config = { - 'crypto_key': { - 'kms_wrapped': { - 'wrapped_key': wrapped_key, - 'crypto_key_name': key_name - } + "crypto_key": { + "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name} }, - 'common_alphabet': alphabet + "common_alphabet": alphabet, } # Add surrogate type if surrogate_type: - crypto_replace_ffx_fpe_config['surrogate_info_type'] = { - 'name': surrogate_type - } + crypto_replace_ffx_fpe_config["surrogate_info_type"] = {"name": surrogate_type} # Construct inspect configuration dictionary - inspect_config = { - 'info_types': [{'name': info_type} for info_type in info_types] - } + inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} # Construct deidentify configuration dictionary deidentify_config = { - 'info_type_transformations': { - 'transformations': [ + "info_type_transformations": { + "transformations": [ { - 'primitive_transformation': { - 'crypto_replace_ffx_fpe_config': - crypto_replace_ffx_fpe_config + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config } } ] @@ -153,21 +157,27 @@ def deidentify_with_fpe(project, string, info_types, alphabet=None, } # Convert string to item - item = {'value': string} + item = {"value": string} # Call the API response = dlp.deidentify_content( - parent, inspect_config=inspect_config, - deidentify_config=deidentify_config, item=item) + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) # Print results print(response.item.value) + + # [END dlp_deidentify_fpe] # [START dlp_reidentify_fpe] -def reidentify_with_fpe(project, string, alphabet=None, - surrogate_type=None, key_name=None, wrapped_key=None): +def reidentify_with_fpe( + project, string, alphabet=None, surrogate_type=None, key_name=None, wrapped_key=None +): """Uses the Data Loss Prevention API to reidentify sensitive data in a string that was encrypted by Format Preserving Encryption (FPE). Args: @@ -191,7 +201,7 @@ def reidentify_with_fpe(project, string, alphabet=None, import google.cloud.dlp # Instantiate a client - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) @@ -199,25 +209,24 @@ def reidentify_with_fpe(project, string, alphabet=None, # The wrapped key is base64-encoded, but the library expects a binary # string, so decode it here. 
import base64 + wrapped_key = base64.b64decode(wrapped_key) # Construct Deidentify Config reidentify_config = { - 'info_type_transformations': { - 'transformations': [ + "info_type_transformations": { + "transformations": [ { - 'primitive_transformation': { - 'crypto_replace_ffx_fpe_config': { - 'crypto_key': { - 'kms_wrapped': { - 'wrapped_key': wrapped_key, - 'crypto_key_name': key_name + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "kms_wrapped": { + "wrapped_key": wrapped_key, + "crypto_key_name": key_name, } }, - 'common_alphabet': alphabet, - 'surrogate_info_type': { - 'name': surrogate_type - } + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, } } } @@ -226,38 +235,41 @@ def reidentify_with_fpe(project, string, alphabet=None, } inspect_config = { - 'custom_info_types': [ - { - 'info_type': { - 'name': surrogate_type - }, - 'surrogate_type': { - } - } + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} ] } # Convert string to item - item = {'value': string} + item = {"value": string} # Call the API response = dlp.reidentify_content( parent, inspect_config=inspect_config, reidentify_config=reidentify_config, - item=item) + item=item, + ) # Print results print(response.item.value) + + # [END dlp_reidentify_fpe] # [START dlp_deidentify_date_shift] -def deidentify_with_date_shift(project, input_csv_file=None, - output_csv_file=None, date_fields=None, - lower_bound_days=None, upper_bound_days=None, - context_field_id=None, wrapped_key=None, - key_name=None): +def deidentify_with_date_shift( + project, + input_csv_file=None, + output_csv_file=None, + date_fields=None, + lower_bound_days=None, + upper_bound_days=None, + context_field_id=None, + wrapped_key=None, + key_name=None, +): """Uses the Data Loss Prevention API to deidentify dates in a CSV file by pseudorandomly shifting them. Args: @@ -289,14 +301,14 @@ def deidentify_with_date_shift(project, input_csv_file=None, import google.cloud.dlp # Instantiate a client - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Convert date field list to Protobuf type def map_fields(field): - return {'name': field} + return {"name": field} if date_fields: date_fields = map(map_fields, date_fields) @@ -306,31 +318,28 @@ def map_fields(field): # Read and parse the CSV file import csv from datetime import datetime + f = [] - with open(input_csv_file, 'r') as csvfile: + with open(input_csv_file, "r") as csvfile: reader = csv.reader(csvfile) for row in reader: f.append(row) # Helper function for converting CSV rows to Protobuf types def map_headers(header): - return {'name': header} + return {"name": header} def map_data(value): try: - date = datetime.strptime(value, '%m/%d/%Y') + date = datetime.strptime(value, "%m/%d/%Y") return { - 'date_value': { - 'year': date.year, - 'month': date.month, - 'day': date.day - } + "date_value": {"year": date.year, "month": date.month, "day": date.day} } except ValueError: - return {'string_value': value} + return {"string_value": value} def map_rows(row): - return {'values': map(map_data, row)} + return {"values": map(map_data, row)} # Using the helper functions, convert CSV rows to protobuf-compatible # dictionaries. 
@@ -338,17 +347,12 @@ def map_rows(row): csv_rows = map(map_rows, f[1:]) # Construct the table dict - table_item = { - 'table': { - 'headers': csv_headers, - 'rows': csv_rows - } - } + table_item = {"table": {"headers": csv_headers, "rows": csv_rows}} # Construct date shift config date_shift_config = { - 'lower_bound_days': lower_bound_days, - 'upper_bound_days': upper_bound_days + "lower_bound_days": lower_bound_days, + "upper_bound_days": upper_bound_days, } # If using a Cloud KMS key, add it to the date_shift_config. @@ -356,26 +360,29 @@ def map_rows(row): # string, so decode it here. if context_field_id and key_name and wrapped_key: import base64 - date_shift_config['context'] = {'name': context_field_id} - date_shift_config['crypto_key'] = { - 'kms_wrapped': { - 'wrapped_key': base64.b64decode(wrapped_key), - 'crypto_key_name': key_name + + date_shift_config["context"] = {"name": context_field_id} + date_shift_config["crypto_key"] = { + "kms_wrapped": { + "wrapped_key": base64.b64decode(wrapped_key), + "crypto_key_name": key_name, } } elif context_field_id or key_name or wrapped_key: - raise ValueError("""You must set either ALL or NONE of - [context_field_id, key_name, wrapped_key]!""") + raise ValueError( + """You must set either ALL or NONE of + [context_field_id, key_name, wrapped_key]!""" + ) # Construct Deidentify Config deidentify_config = { - 'record_transformations': { - 'field_transformations': [ + "record_transformations": { + "field_transformations": [ { - 'fields': date_fields, - 'primitive_transformation': { - 'date_shift_config': date_shift_config - } + "fields": date_fields, + "primitive_transformation": { + "date_shift_config": date_shift_config + }, } ] } @@ -386,199 +393,252 @@ def write_header(header): return header.name def write_data(data): - return data.string_value or '%s/%s/%s' % (data.date_value.month, - data.date_value.day, - data.date_value.year) + return data.string_value or "%s/%s/%s" % ( + data.date_value.month, + data.date_value.day, + data.date_value.year, + ) # Call the API response = dlp.deidentify_content( - parent, deidentify_config=deidentify_config, item=table_item) + parent, deidentify_config=deidentify_config, item=table_item + ) # Write results to CSV file - with open(output_csv_file, 'w') as csvfile: - write_file = csv.writer(csvfile, delimiter=',') + with open(output_csv_file, "w") as csvfile: + write_file = csv.writer(csvfile, delimiter=",") write_file.writerow(map(write_header, response.item.table.headers)) for row in response.item.table.rows: write_file.writerow(map(write_data, row.values)) # Print status - print('Successfully saved date-shift output to {}'.format( - output_csv_file)) + print("Successfully saved date-shift output to {}".format(output_csv_file)) + + # [END dlp_deidentify_date_shift] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( - dest='content', help='Select how to submit content to the API.') + dest="content", help="Select how to submit content to the API." + ) subparsers.required = True mask_parser = subparsers.add_parser( - 'deid_mask', - help='Deidentify sensitive data in a string by masking it with a ' - 'character.') + "deid_mask", + help="Deidentify sensitive data in a string by masking it with a " "character.", + ) mask_parser.add_argument( - '--info_types', nargs='+', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. 
Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) mask_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') - mask_parser.add_argument('item', help='The string to deidentify.') + "project", help="The Google Cloud project id to use as a parent resource." + ) + mask_parser.add_argument("item", help="The string to deidentify.") mask_parser.add_argument( - '-n', '--number_to_mask', + "-n", + "--number_to_mask", type=int, default=0, - help='The maximum number of sensitive characters to mask in a match. ' - 'If omitted the request or set to 0, the API will mask any mathcing ' - 'characters.') + help="The maximum number of sensitive characters to mask in a match. " + "If omitted from the request or set to 0, the API will mask any matching " + "characters.", + ) mask_parser.add_argument( - '-m', '--masking_character', - help='The character to mask matching sensitive data with.') + "-m", + "--masking_character", + help="The character to mask matching sensitive data with.", + ) fpe_parser = subparsers.add_parser( - 'deid_fpe', - help='Deidentify sensitive data in a string using Format Preserving ' - 'Encryption (FPE).') + "deid_fpe", + help="Deidentify sensitive data in a string using Format Preserving " + "Encryption (FPE).", + ) fpe_parser.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) fpe_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + "project", help="The Google Cloud project id to use as a parent resource." + ) fpe_parser.add_argument( - 'item', - help='The string to deidentify. ' - 'Example: string = \'My SSN is 372819127\'') + "item", + help="The string to deidentify. " "Example: string = 'My SSN is 372819127'", + ) fpe_parser.add_argument( - 'key_name', - help='The name of the Cloud KMS key used to encrypt (\'wrap\') the ' - 'AES-256 key. Example: ' - 'key_name = \'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/' - 'keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME\'') + "key_name", + help="The name of the Cloud KMS key used to encrypt ('wrap') the " + "AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) fpe_parser.add_argument( - 'wrapped_key', - help='The encrypted (\'wrapped\') AES-256 key to use.
This key should ' - 'be encrypted using the Cloud KMS key specified by key_name.') + "wrapped_key", + help="The encrypted ('wrapped') AES-256 key to use. This key should " + "be encrypted using the Cloud KMS key specified by key_name.", + ) fpe_parser.add_argument( - '-a', '--alphabet', default='ALPHA_NUMERIC', - help='The set of characters to replace sensitive ones with. Commonly ' + "-a", + "--alphabet", + default="ALPHA_NUMERIC", + help="The set of characters to replace sensitive ones with. Commonly " 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' - '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"') + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', + ) fpe_parser.add_argument( - '-s', '--surrogate_type', - help='The name of the surrogate custom info type to use. Only ' - 'necessary if you want to reverse the deidentification process. Can ' - 'be essentially any arbitrary string, as long as it doesn\'t appear ' - 'in your dataset otherwise.') + "-s", + "--surrogate_type", + help="The name of the surrogate custom info type to use. Only " + "necessary if you want to reverse the deidentification process. Can " + "be essentially any arbitrary string, as long as it doesn't appear " + "in your dataset otherwise.", + ) reid_parser = subparsers.add_parser( - 'reid_fpe', - help='Reidentify sensitive data in a string using Format Preserving ' - 'Encryption (FPE).') + "reid_fpe", + help="Reidentify sensitive data in a string using Format Preserving " + "Encryption (FPE).", + ) reid_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + "project", help="The Google Cloud project id to use as a parent resource." + ) reid_parser.add_argument( - 'item', - help='The string to deidentify. ' - 'Example: string = \'My SSN is 372819127\'') + "item", + help="The string to deidentify. " "Example: string = 'My SSN is 372819127'", + ) reid_parser.add_argument( - 'surrogate_type', - help='The name of the surrogate custom info type to use. Only ' - 'necessary if you want to reverse the deidentification process. Can ' - 'be essentially any arbitrary string, as long as it doesn\'t appear ' - 'in your dataset otherwise.') + "surrogate_type", + help="The name of the surrogate custom info type to use. Only " + "necessary if you want to reverse the deidentification process. Can " + "be essentially any arbitrary string, as long as it doesn't appear " + "in your dataset otherwise.", + ) reid_parser.add_argument( - 'key_name', - help='The name of the Cloud KMS key used to encrypt (\'wrap\') the ' - 'AES-256 key. Example: ' - 'key_name = \'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/' - 'keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME\'') + "key_name", + help="The name of the Cloud KMS key used to encrypt ('wrap') the " + "AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) reid_parser.add_argument( - 'wrapped_key', - help='The encrypted (\'wrapped\') AES-256 key to use. This key should ' - 'be encrypted using the Cloud KMS key specified by key_name.') + "wrapped_key", + help="The encrypted ('wrapped') AES-256 key to use. This key should " + "be encrypted using the Cloud KMS key specified by key_name.", + ) reid_parser.add_argument( - '-a', '--alphabet', default='ALPHA_NUMERIC', - help='The set of characters to replace sensitive ones with. 
Commonly ' + "-a", + "--alphabet", + default="ALPHA_NUMERIC", + help="The set of characters to replace sensitive ones with. Commonly " 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' - '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"') + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', + ) date_shift_parser = subparsers.add_parser( - 'deid_date_shift', - help='Deidentify dates in a CSV file by pseudorandomly shifting them.') + "deid_date_shift", + help="Deidentify dates in a CSV file by pseudorandomly shifting them.", + ) date_shift_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + "project", help="The Google Cloud project id to use as a parent resource." + ) date_shift_parser.add_argument( - 'input_csv_file', - help='The path to the CSV file to deidentify. The first row of the ' - 'file must specify column names, and all other rows must contain ' - 'valid values.') + "input_csv_file", + help="The path to the CSV file to deidentify. The first row of the " + "file must specify column names, and all other rows must contain " + "valid values.", + ) date_shift_parser.add_argument( - 'output_csv_file', - help='The path to save the date-shifted CSV file.') + "output_csv_file", help="The path to save the date-shifted CSV file." + ) date_shift_parser.add_argument( - 'lower_bound_days', type=int, - help='The maximum number of days to shift a date backward') + "lower_bound_days", + type=int, + help="The maximum number of days to shift a date backward", + ) date_shift_parser.add_argument( - 'upper_bound_days', type=int, - help='The maximum number of days to shift a date forward') + "upper_bound_days", + type=int, + help="The maximum number of days to shift a date forward", + ) date_shift_parser.add_argument( - 'date_fields', nargs='+', - help='The list of date fields in the CSV file to date shift. Example: ' - '[\'birth_date\', \'register_date\']') + "date_fields", + nargs="+", + help="The list of date fields in the CSV file to date shift. Example: " + "['birth_date', 'register_date']", + ) date_shift_parser.add_argument( - '--context_field_id', - help='(Optional) The column to determine date shift amount based on. ' - 'If this is not specified, a random shift amount will be used for ' - 'every row. If this is specified, then \'wrappedKey\' and \'keyName\' ' - 'must also be set.') + "--context_field_id", + help="(Optional) The column to determine date shift amount based on. " + "If this is not specified, a random shift amount will be used for " + "every row. If this is specified, then 'wrappedKey' and 'keyName' " + "must also be set.", + ) date_shift_parser.add_argument( - '--key_name', - help='(Optional) The name of the Cloud KMS key used to encrypt ' - '(\'wrap\') the AES-256 key. Example: ' - 'key_name = \'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/' - 'keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME\'') + "--key_name", + help="(Optional) The name of the Cloud KMS key used to encrypt " + "('wrap') the AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) date_shift_parser.add_argument( - '--wrapped_key', - help='(Optional) The encrypted (\'wrapped\') AES-256 key to use. This ' - 'key should be encrypted using the Cloud KMS key specified by' - 'key_name.') + "--wrapped_key", + help="(Optional) The encrypted ('wrapped') AES-256 key to use. 
This " + "key should be encrypted using the Cloud KMS key specified by" + "key_name.", + ) args = parser.parse_args() - if args.content == 'deid_mask': - deidentify_with_mask(args.project, args.item, args.info_types, - masking_character=args.masking_character, - number_to_mask=args.number_to_mask) - elif args.content == 'deid_fpe': - deidentify_with_fpe(args.project, args.item, args.info_types, - alphabet=args.alphabet, - wrapped_key=args.wrapped_key, - key_name=args.key_name, - surrogate_type=args.surrogate_type) - elif args.content == 'reid_fpe': - reidentify_with_fpe(args.project, args.item, - surrogate_type=args.surrogate_type, - wrapped_key=args.wrapped_key, - key_name=args.key_name, alphabet=args.alphabet) - elif args.content == 'deid_date_shift': - deidentify_with_date_shift(args.project, - input_csv_file=args.input_csv_file, - output_csv_file=args.output_csv_file, - lower_bound_days=args.lower_bound_days, - upper_bound_days=args.upper_bound_days, - date_fields=args.date_fields, - context_field_id=args.context_field_id, - wrapped_key=args.wrapped_key, - key_name=args.key_name) + if args.content == "deid_mask": + deidentify_with_mask( + args.project, + args.item, + args.info_types, + masking_character=args.masking_character, + number_to_mask=args.number_to_mask, + ) + elif args.content == "deid_fpe": + deidentify_with_fpe( + args.project, + args.item, + args.info_types, + alphabet=args.alphabet, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + surrogate_type=args.surrogate_type, + ) + elif args.content == "reid_fpe": + reidentify_with_fpe( + args.project, + args.item, + surrogate_type=args.surrogate_type, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + alphabet=args.alphabet, + ) + elif args.content == "deid_date_shift": + deidentify_with_date_shift( + args.project, + input_csv_file=args.input_csv_file, + output_csv_file=args.output_csv_file, + lower_bound_days=args.lower_bound_days, + upper_bound_days=args.upper_bound_days, + date_fields=args.date_fields, + context_field_id=args.context_field_id, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + ) diff --git a/dlp/deid_test.py b/dlp/deid_test.py index e381f4502ef5..df9dae418e6e 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -20,22 +20,26 @@ import deid -HARMFUL_STRING = 'My SSN is 372819127' -HARMLESS_STRING = 'My favorite color is blue' -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -WRAPPED_KEY = ('CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy' - 'uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL' - 'rotx7Chxz/4z7SIpXFOBY61z0/U=') -KEY_NAME = ('projects/python-docs-samples-tests/locations/global/keyRings/' - 'dlp-test/cryptoKeys/dlp-test') -SURROGATE_TYPE = 'SSN_TOKEN' -CSV_FILE = os.path.join(os.path.dirname(__file__), 'resources/dates.csv') +HARMFUL_STRING = "My SSN is 372819127" +HARMLESS_STRING = "My favorite color is blue" +GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT") +WRAPPED_KEY = ( + "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy" + "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL" + "rotx7Chxz/4z7SIpXFOBY61z0/U=" +) +KEY_NAME = ( + "projects/python-docs-samples-tests/locations/global/keyRings/" + "dlp-test/cryptoKeys/dlp-test" +) +SURROGATE_TYPE = "SSN_TOKEN" +CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv") DATE_SHIFTED_AMOUNT = 30 -DATE_FIELDS = ['birth_date', 'register_date'] -CSV_CONTEXT_FIELD = 'name' +DATE_FIELDS = ["birth_date", "register_date"] +CSV_CONTEXT_FIELD = "name" 
-@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tempdir(): tempdir = tempfile.mkdtemp() yield tempdir @@ -43,16 +47,18 @@ def tempdir(): def test_deidentify_with_mask(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, - ['US_SOCIAL_SECURITY_NUMBER']) + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) out, _ = capsys.readouterr() - assert 'My SSN is *********' in out + assert "My SSN is *********" in out def test_deidentify_with_mask_ignore_insensitive_data(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMLESS_STRING, - ['US_SOCIAL_SECURITY_NUMBER']) + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) out, _ = capsys.readouterr() assert HARMLESS_STRING in out @@ -62,66 +68,70 @@ def test_deidentify_with_mask_masking_character_specified(capsys): deid.deidentify_with_mask( GCLOUD_PROJECT, HARMFUL_STRING, - ['US_SOCIAL_SECURITY_NUMBER'], - masking_character='#') + ["US_SOCIAL_SECURITY_NUMBER"], + masking_character="#", + ) out, _ = capsys.readouterr() - assert 'My SSN is #########' in out + assert "My SSN is #########" in out def test_deidentify_with_mask_masking_number_specified(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, - ['US_SOCIAL_SECURITY_NUMBER'], - number_to_mask=7) + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"], number_to_mask=7 + ) out, _ = capsys.readouterr() - assert 'My SSN is *******27' in out + assert "My SSN is *******27" in out def test_deidentify_with_fpe(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMFUL_STRING, - ['US_SOCIAL_SECURITY_NUMBER'], - alphabet='NUMERIC', + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME) + key_name=KEY_NAME, + ) out, _ = capsys.readouterr() - assert 'My SSN is' in out - assert '372819127' not in out + assert "My SSN is" in out + assert "372819127" not in out def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMFUL_STRING, - ['US_SOCIAL_SECURITY_NUMBER'], - alphabet='NUMERIC', + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", wrapped_key=WRAPPED_KEY, key_name=KEY_NAME, - surrogate_type=SURROGATE_TYPE) + surrogate_type=SURROGATE_TYPE, + ) out, _ = capsys.readouterr() - assert 'My SSN is SSN_TOKEN' in out - assert '372819127' not in out + assert "My SSN is SSN_TOKEN" in out + assert "372819127" not in out def test_deidentify_with_fpe_ignores_insensitive_data(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMLESS_STRING, - ['US_SOCIAL_SECURITY_NUMBER'], - alphabet='NUMERIC', + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME) + key_name=KEY_NAME, + ) out, _ = capsys.readouterr() assert HARMLESS_STRING in out def test_deidentify_with_date_shift(tempdir, capsys): - output_filepath = os.path.join(tempdir, 'dates-shifted.csv') + output_filepath = os.path.join(tempdir, "dates-shifted.csv") deid.deidentify_with_date_shift( GCLOUD_PROJECT, @@ -129,15 +139,16 @@ def test_deidentify_with_date_shift(tempdir, capsys): output_csv_file=output_filepath, lower_bound_days=DATE_SHIFTED_AMOUNT, upper_bound_days=DATE_SHIFTED_AMOUNT, - date_fields=DATE_FIELDS) + date_fields=DATE_FIELDS, + ) out, _ = capsys.readouterr() - assert 'Successful' in out + assert "Successful" in out def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): - output_filepath = os.path.join(tempdir, 
'dates-shifted.csv') + output_filepath = os.path.join(tempdir, "dates-shifted.csv") deid.deidentify_with_date_shift( GCLOUD_PROJECT, @@ -148,15 +159,16 @@ def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): date_fields=DATE_FIELDS, context_field_id=CSV_CONTEXT_FIELD, wrapped_key=WRAPPED_KEY, - key_name=KEY_NAME) + key_name=KEY_NAME, + ) out, _ = capsys.readouterr() - assert 'Successful' in out + assert "Successful" in out def test_reidentify_with_fpe(capsys): - labeled_fpe_string = 'My SSN is SSN_TOKEN(9):731997681' + labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681" deid.reidentify_with_fpe( GCLOUD_PROJECT, @@ -164,8 +176,9 @@ def test_reidentify_with_fpe(capsys): surrogate_type=SURROGATE_TYPE, wrapped_key=WRAPPED_KEY, key_name=KEY_NAME, - alphabet='NUMERIC') + alphabet="NUMERIC", + ) out, _ = capsys.readouterr() - assert '731997681' not in out + assert "731997681" not in out diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 6d93241bb9ac..0c151bf64e77 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -23,9 +23,16 @@ # [START dlp_inspect_string] -def inspect_string(project, content_string, info_types, - custom_dictionaries=None, custom_regexes=None, - min_likelihood=None, max_findings=None, include_quote=True): +def inspect_string( + project, + content_string, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): """Uses the Data Loss Prevention API to analyze strings for protected data. Args: project: The Google Cloud project id to use as a parent resource. @@ -46,42 +53,46 @@ def inspect_string(project, content_string, info_types, import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). - info_types = [{'name': info_type} for info_type in info_types] + info_types = [{"name": info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dict.split(',')} + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, } - } for i, custom_dict in enumerate(custom_dictionaries)] + for i, custom_dict in enumerate(custom_dictionaries) + ] if custom_regexes is None: custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regex} - } for i, custom_regex in enumerate(custom_regexes)] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { - 'info_types': info_types, - 'custom_info_types': custom_info_types, - 'min_likelihood': min_likelihood, - 'include_quote': include_quote, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, } # Construct the `item`. - item = {'value': content_string} + item = {"value": content_string} # Convert the project id into a full resource id. parent = dlp.project_path(project) @@ -94,21 +105,30 @@ def inspect_string(project, content_string, info_types, for finding in response.result.findings: try: if finding.quote: - print('Quote: {}'.format(finding.quote)) + print("Quote: {}".format(finding.quote)) except AttributeError: pass - print('Info type: {}'.format(finding.info_type.name)) - print('Likelihood: {}'.format(finding.likelihood)) + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) else: - print('No findings.') + print("No findings.") + + # [END dlp_inspect_string] # [START dlp_inspect_table] -def inspect_table(project, data, info_types, - custom_dictionaries=None, custom_regexes=None, - min_likelihood=None, max_findings=None, include_quote=True): +def inspect_table( + project, + data, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): """Uses the Data Loss Prevention API to analyze strings for protected data. Args: project: The Google Cloud project id to use as a parent resource. @@ -157,38 +177,42 @@ def inspect_table(project, data, info_types, import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). - info_types = [{'name': info_type} for info_type in info_types] + info_types = [{"name": info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dict.split(',')} + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, } - } for i, custom_dict in enumerate(custom_dictionaries)] + for i, custom_dict in enumerate(custom_dictionaries) + ] if custom_regexes is None: custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regex} - } for i, custom_regex in enumerate(custom_regexes)] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { - 'info_types': info_types, - 'custom_info_types': custom_info_types, - 'min_likelihood': min_likelihood, - 'include_quote': include_quote, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, } # Construct the `table`. For more details on the table schema, please see @@ -196,9 +220,7 @@ def inspect_table(project, data, info_types, headers = [{"name": val} for val in data["header"]] rows = [] for row in data["rows"]: - rows.append({ - "values": [{"string_value": cell_val} for cell_val in row] - }) + rows.append({"values": [{"string_value": cell_val} for cell_val in row]}) table = {} table["headers"] = headers @@ -215,21 +237,31 @@ def inspect_table(project, data, info_types, for finding in response.result.findings: try: if finding.quote: - print('Quote: {}'.format(finding.quote)) + print("Quote: {}".format(finding.quote)) except AttributeError: pass - print('Info type: {}'.format(finding.info_type.name)) - print('Likelihood: {}'.format(finding.likelihood)) + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) else: - print('No findings.') + print("No findings.") + + # [END dlp_inspect_table] # [START dlp_inspect_file] -def inspect_file(project, filename, info_types, min_likelihood=None, - custom_dictionaries=None, custom_regexes=None, - max_findings=None, include_quote=True, mime_type=None): +def inspect_file( + project, + filename, + info_types, + min_likelihood=None, + custom_dictionaries=None, + custom_regexes=None, + max_findings=None, + include_quote=True, + mime_type=None, +): """Uses the Data Loss Prevention API to analyze a file for protected data. Args: project: The Google Cloud project id to use as a parent resource. @@ -254,39 +286,43 @@ def inspect_file(project, filename, info_types, min_likelihood=None, import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: - info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] - info_types = [{'name': info_type} for info_type in info_types] + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
if custom_dictionaries is None: custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dict.split(',')} + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, } - } for i, custom_dict in enumerate(custom_dictionaries)] + for i, custom_dict in enumerate(custom_dictionaries) + ] if custom_regexes is None: custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regex} - } for i, custom_regex in enumerate(custom_regexes)] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'custom_info_types': custom_info_types, - 'min_likelihood': min_likelihood, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, } # If mime_type is not specified, guess it from the filename. @@ -297,17 +333,17 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Select the content type index from the list of supported types. supported_content_types = { None: 0, # "Unspecified" - 'image/jpeg': 1, - 'image/bmp': 2, - 'image/png': 3, - 'image/svg': 4, - 'text/plain': 5, + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, } content_type_index = supported_content_types.get(mime_type, 0) # Construct the item, containing the file's byte data. - with open(filename, mode='rb') as f: - item = {'byte_item': {'type': content_type_index, 'data': f.read()}} + with open(filename, mode="rb") as f: + item = {"byte_item": {"type": content_type_index, "data": f.read()}} # Convert the project id into a full resource id. parent = dlp.project_path(project) @@ -319,21 +355,32 @@ def inspect_file(project, filename, info_types, min_likelihood=None, if response.result.findings: for finding in response.result.findings: try: - print('Quote: {}'.format(finding.quote)) + print("Quote: {}".format(finding.quote)) except AttributeError: pass - print('Info type: {}'.format(finding.info_type.name)) - print('Likelihood: {}'.format(finding.likelihood)) + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) else: - print('No findings.') + print("No findings.") + + # [END dlp_inspect_file] # [START dlp_inspect_gcs] -def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, - info_types, custom_dictionaries=None, - custom_regexes=None, min_likelihood=None, - max_findings=None, timeout=300): +def inspect_gcs_file( + project, + bucket, + filename, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): """Uses the Data Loss Prevention API to analyze a file on GCS. Args: project: The Google Cloud project id to use as a parent resource. @@ -367,62 +414,60 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, import threading # Instantiate a client. 
- dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: - info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] - info_types = [{'name': info_type} for info_type in info_types] + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dict.split(',')} + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, } - } for i, custom_dict in enumerate(custom_dictionaries)] + for i, custom_dict in enumerate(custom_dictionaries) + ] if custom_regexes is None: custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regex} - } for i, custom_regex in enumerate(custom_regexes)] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'custom_info_types': custom_info_types, - 'min_likelihood': min_likelihood, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, } # Construct a storage_config containing the file's URL. - url = 'gs://{}/{}'.format(bucket, filename) - storage_config = { - 'cloud_storage_options': { - 'file_set': {'url': url} - } - } + url = "gs://{}/{}".format(bucket, filename) + storage_config = {"cloud_storage_options": {"file_set": {"url": url}}} # Convert the project id into a full resource id. parent = dlp.project_path(project) # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { - 'inspect_config': inspect_config, - 'storage_config': storage_config, - 'actions': actions, + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, } operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) @@ -430,8 +475,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. 
@@ -439,7 +483,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, def callback(message): try: - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() @@ -447,10 +491,13 @@ def callback(message): job = dlp.get_dlp_job(operation.name) if job.inspect_details.result.info_type_stats: for finding in job.inspect_details.result.info_type_stats: - print('Info type: {}; Count: {}'.format( - finding.info_type.name, finding.count)) + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) else: - print('No findings.') + print("No findings.") # Signal to the main thread that we can exit. job_done.set() @@ -466,18 +513,30 @@ def callback(message): subscriber.subscribe(subscription_path, callback=callback) finished = job_done.wait(timeout=timeout) if not finished: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + # [END dlp_inspect_gcs] # [START dlp_inspect_datastore] -def inspect_datastore(project, datastore_project, kind, - topic_id, subscription_id, info_types, - custom_dictionaries=None, custom_regexes=None, - namespace_id=None, min_likelihood=None, - max_findings=None, timeout=300): +def inspect_datastore( + project, + datastore_project, + kind, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + namespace_id=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): """Uses the Data Loss Prevention API to analyze Datastore data. Args: project: The Google Cloud project id to use as a parent resource. @@ -511,51 +570,53 @@ def inspect_datastore(project, datastore_project, kind, import threading # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: - info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] - info_types = [{'name': info_type} for info_type in info_types] + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dict.split(',')} + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, } - } for i, custom_dict in enumerate(custom_dictionaries)] + for i, custom_dict in enumerate(custom_dictionaries) + ] if custom_regexes is None: custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regex} - } for i, custom_regex in enumerate(custom_regexes)] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'custom_info_types': custom_info_types, - 'min_likelihood': min_likelihood, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, } # Construct a storage_config containing the target Datastore info. storage_config = { - 'datastore_options': { - 'partition_id': { - 'project_id': datastore_project, - 'namespace_id': namespace_id, - }, - 'kind': { - 'name': kind + "datastore_options": { + "partition_id": { + "project_id": datastore_project, + "namespace_id": namespace_id, }, + "kind": {"name": kind}, } } @@ -563,15 +624,13 @@ def inspect_datastore(project, datastore_project, kind, parent = dlp.project_path(project) # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { - 'inspect_config': inspect_config, - 'storage_config': storage_config, - 'actions': actions, + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, } operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) @@ -579,8 +638,7 @@ def inspect_datastore(project, datastore_project, kind, # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. @@ -588,7 +646,7 @@ def inspect_datastore(project, datastore_project, kind, def callback(message): try: - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() @@ -596,10 +654,13 @@ def callback(message): job = dlp.get_dlp_job(operation.name) if job.inspect_details.result.info_type_stats: for finding in job.inspect_details.result.info_type_stats: - print('Info type: {}; Count: {}'.format( - finding.info_type.name, finding.count)) + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) else: - print('No findings.') + print("No findings.") # Signal to the main thread that we can exit. job_done.set() @@ -617,17 +678,30 @@ def callback(message): finished = job_done.wait(timeout=timeout) if not finished: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." 
+ ) + # [END dlp_inspect_datastore] # [START dlp_inspect_bigquery] -def inspect_bigquery(project, bigquery_project, dataset_id, table_id, - topic_id, subscription_id, info_types, - custom_dictionaries=None, custom_regexes=None, - min_likelihood=None, max_findings=None, timeout=300): +def inspect_bigquery( + project, + bigquery_project, + dataset_id, + table_id, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): """Uses the Data Loss Prevention API to analyze BigQuery data. Args: project: The Google Cloud project id to use as a parent resource. @@ -662,48 +736,52 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, import threading # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). if not info_types: - info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] - info_types = [{'name': info_type} for info_type in info_types] + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dict.split(',')} + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, } - } for i, custom_dict in enumerate(custom_dictionaries)] + for i, custom_dict in enumerate(custom_dictionaries) + ] if custom_regexes is None: custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regex} - } for i, custom_regex in enumerate(custom_regexes)] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'custom_info_types': custom_info_types, - 'min_likelihood': min_likelihood, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, } # Construct a storage_config containing the target Bigquery info. storage_config = { - 'big_query_options': { - 'table_reference': { - 'project_id': bigquery_project, - 'dataset_id': dataset_id, - 'table_id': table_id, + "big_query_options": { + "table_reference": { + "project_id": bigquery_project, + "dataset_id": dataset_id, + "table_id": table_id, } } } @@ -712,15 +790,13 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, parent = dlp.project_path(project) # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Construct the inspect_job, which defines the entire inspect content task. 
inspect_job = { - 'inspect_config': inspect_config, - 'storage_config': storage_config, - 'actions': actions, + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, } operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) @@ -728,8 +804,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. @@ -737,7 +812,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, def callback(message): try: - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() @@ -745,10 +820,13 @@ def callback(message): job = dlp.get_dlp_job(operation.name) if job.inspect_details.result.info_type_stats: for finding in job.inspect_details.result.info_type_stats: - print('Info type: {}; Count: {}'.format( - finding.info_type.name, finding.count)) + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) else: - print('No findings.') + print("No findings.") # Signal to the main thread that we can exit. job_done.set() @@ -765,369 +843,515 @@ def callback(message): subscriber.subscribe(subscription_path, callback=callback) finished = job_done.wait(timeout=timeout) if not finished: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + # [END dlp_inspect_bigquery] -if __name__ == '__main__': - default_project = os.environ.get('GCLOUD_PROJECT') +if __name__ == "__main__": + default_project = os.environ.get("GCLOUD_PROJECT") parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( - dest='content', help='Select how to submit content to the API.') + dest="content", help="Select how to submit content to the API." + ) subparsers.required = True - parser_string = subparsers.add_parser('string', help='Inspect a string.') - parser_string.add_argument('item', help='The string to inspect.') + parser_string = subparsers.add_parser("string", help="Inspect a string.") + parser_string.add_argument("item", help="The string to inspect.") parser_string.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_string.add_argument( - '--info_types', nargs='+', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + nargs="+", + help="Strings representing info types to look for. 
A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_string.add_argument( - '--custom_dictionaries', action='append', - help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types. Each string is a comma ' - 'delimited list of words representing a distinct dictionary.', - default=None) + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) parser_string.add_argument( - '--custom_regexes', action='append', - help='Strings representing regex patterns to search for as custom ' - ' info types.', - default=None) + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) parser_string.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_string.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_string.add_argument( - '--include_quote', type=bool, - help='A boolean for whether to display a quote of the detected ' - 'information in the results.', - default=True) - - parser_table = subparsers.add_parser('table', help='Inspect a table.') + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_table = subparsers.add_parser("table", help="Inspect a table.") parser_table.add_argument( - 'data', help='Json string representing a table.', type=json.loads) + "data", help="Json string representing a table.", type=json.loads + ) parser_table.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_table.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_table.add_argument( - '--custom_dictionaries', action='append', - help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types. Each string is a comma ' - 'delimited list of words representing a distinct dictionary.', - default=None) + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) parser_table.add_argument( - '--custom_regexes', action='append', - help='Strings representing regex patterns to search for as custom ' - ' info types.', - default=None) + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) parser_table.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_table.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_table.add_argument( - '--include_quote', type=bool, - help='A boolean for whether to display a quote of the detected ' - 'information in the results.', - default=True) - - parser_file = subparsers.add_parser('file', help='Inspect a local file.') + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_file = subparsers.add_parser("file", help="Inspect a local file.") + parser_file.add_argument("filename", help="The path to the file to inspect.") parser_file.add_argument( - 'filename', help='The path to the file to inspect.') + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_file.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_file.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. 
Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) parser_file.add_argument( - '--custom_dictionaries', action='append', - help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types. Each string is a comma ' - 'delimited list of words representing a distinct dictionary.', - default=None) + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) parser_file.add_argument( - '--custom_regexes', action='append', - help='Strings representing regex patterns to search for as custom ' - ' info types.', - default=None) + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_file.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_file.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) parser_file.add_argument( - '--include_quote', type=bool, - help='A boolean for whether to display a quote of the detected ' - 'information in the results.', - default=True) - parser_file.add_argument( - '--mime_type', - help='The MIME type of the file. If not specified, the type is ' - 'inferred via the Python standard library\'s mimetypes module.') + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) parser_gcs = subparsers.add_parser( - 'gcs', help='Inspect files on Google Cloud Storage.') + "gcs", help="Inspect files on Google Cloud Storage." + ) parser_gcs.add_argument( - 'bucket', help='The name of the GCS bucket containing the file.') + "bucket", help="The name of the GCS bucket containing the file." + ) parser_gcs.add_argument( - 'filename', - help='The name of the file in the bucket, including the path, e.g. ' - '"images/myfile.png". Wildcards are permitted.') + "filename", + help="The name of the file in the bucket, including the path, e.g. " + '"images/myfile.png". Wildcards are permitted.', + ) parser_gcs.add_argument( - 'topic_id', - help='The id of the Cloud Pub/Sub topic to use to report that the job ' - 'is complete, e.g. "dlp-sample-topic".') + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) parser_gcs.add_argument( - 'subscription_id', - help='The id of the Cloud Pub/Sub subscription to monitor for job ' + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " 'completion, e.g. "dlp-sample-subscription". The subscription must ' - 'already be subscribed to the topic. See the test files or the Cloud ' - 'Pub/Sub sample files for examples on how to create the subscription.') + "already be subscribed to the topic. 
See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) parser_gcs.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_gcs.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_gcs.add_argument( - '--custom_dictionaries', action='append', - help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types. Each string is a comma ' - 'delimited list of words representing a distinct dictionary.', - default=None) + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) parser_gcs.add_argument( - '--custom_regexes', action='append', - help='Strings representing regex patterns to search for as custom ' - ' info types.', - default=None) + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) parser_gcs.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_gcs.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_gcs.add_argument( - '--timeout', type=int, - help='The maximum number of seconds to wait for a response from the ' - 'API. The default is 300 seconds.', - default=300) + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) parser_datastore = subparsers.add_parser( - 'datastore', help='Inspect files on Google Datastore.') + "datastore", help="Inspect files on Google Datastore." + ) parser_datastore.add_argument( - 'datastore_project', - help='The Google Cloud project id of the target Datastore.') + "datastore_project", help="The Google Cloud project id of the target Datastore." + ) parser_datastore.add_argument( - 'kind', - help='The kind of the Datastore entity to inspect, e.g. "Person".') + "kind", help='The kind of the Datastore entity to inspect, e.g. "Person".' 
+ ) parser_datastore.add_argument( - 'topic_id', - help='The id of the Cloud Pub/Sub topic to use to report that the job ' - 'is complete, e.g. "dlp-sample-topic".') + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) parser_datastore.add_argument( - 'subscription_id', - help='The id of the Cloud Pub/Sub subscription to monitor for job ' + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " 'completion, e.g. "dlp-sample-subscription". The subscription must ' - 'already be subscribed to the topic. See the test files or the Cloud ' - 'Pub/Sub sample files for examples on how to create the subscription.') + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) parser_datastore.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_datastore.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_datastore.add_argument( - '--custom_dictionaries', action='append', - help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types. Each string is a comma ' - 'delimited list of words representing a distinct dictionary.', - default=None) + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) parser_datastore.add_argument( - '--custom_regexes', action='append', - help='Strings representing regex patterns to search for as custom ' - ' info types.', - default=None) + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) parser_datastore.add_argument( - '--namespace_id', - help='The Datastore namespace to use, if applicable.') + "--namespace_id", help="The Datastore namespace to use, if applicable." 
+ ) parser_datastore.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_datastore.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_datastore.add_argument( - '--timeout', type=int, - help='The maximum number of seconds to wait for a response from the ' - 'API. The default is 300 seconds.', - default=300) + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) parser_bigquery = subparsers.add_parser( - 'bigquery', help='Inspect files on Google BigQuery.') + "bigquery", help="Inspect files on Google BigQuery." + ) parser_bigquery.add_argument( - 'bigquery_project', - help='The Google Cloud project id of the target table.') + "bigquery_project", help="The Google Cloud project id of the target table." + ) parser_bigquery.add_argument( - 'dataset_id', - help='The ID of the target BigQuery dataset.') + "dataset_id", help="The ID of the target BigQuery dataset." + ) parser_bigquery.add_argument( - 'table_id', - help='The ID of the target BigQuery table.') + "table_id", help="The ID of the target BigQuery table." + ) parser_bigquery.add_argument( - 'topic_id', - help='The id of the Cloud Pub/Sub topic to use to report that the job ' - 'is complete, e.g. "dlp-sample-topic".') + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) parser_bigquery.add_argument( - 'subscription_id', - help='The id of the Cloud Pub/Sub subscription to monitor for job ' + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " 'completion, e.g. "dlp-sample-subscription". The subscription must ' - 'already be subscribed to the topic. See the test files or the Cloud ' - 'Pub/Sub sample files for examples on how to create the subscription.') + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) parser_bigquery.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_bigquery.add_argument( - '--info_types', nargs='+', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_bigquery.add_argument( - '--custom_dictionaries', action='append', - help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types. Each string is a comma ' - 'delimited list of words representing a distinct dictionary.', - default=None) + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) parser_bigquery.add_argument( - '--custom_regexes', action='append', - help='Strings representing regex patterns to search for as custom ' - ' info types.', - default=None) + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) parser_bigquery.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_bigquery.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_bigquery.add_argument( - '--timeout', type=int, - help='The maximum number of seconds to wait for a response from the ' - 'API. The default is 300 seconds.', - default=300) + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. 
The default is 300 seconds.", + default=300, + ) args = parser.parse_args() - if args.content == 'string': + if args.content == "string": inspect_string( - args.project, args.item, args.info_types, + args.project, + args.item, + args.info_types, custom_dictionaries=args.custom_dictionaries, custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, - include_quote=args.include_quote) - elif args.content == 'table': + include_quote=args.include_quote, + ) + elif args.content == "table": inspect_table( - args.project, args.data, args.info_types, + args.project, + args.data, + args.info_types, custom_dictionaries=args.custom_dictionaries, custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, - include_quote=args.include_quote) - elif args.content == 'file': + include_quote=args.include_quote, + ) + elif args.content == "file": inspect_file( - args.project, args.filename, args.info_types, + args.project, + args.filename, + args.info_types, custom_dictionaries=args.custom_dictionaries, custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote, - mime_type=args.mime_type) - elif args.content == 'gcs': + mime_type=args.mime_type, + ) + elif args.content == "gcs": inspect_gcs_file( - args.project, args.bucket, args.filename, - args.topic_id, args.subscription_id, + args.project, + args.bucket, + args.filename, + args.topic_id, + args.subscription_id, args.info_types, custom_dictionaries=args.custom_dictionaries, custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, - timeout=args.timeout) - elif args.content == 'datastore': + timeout=args.timeout, + ) + elif args.content == "datastore": inspect_datastore( - args.project, args.datastore_project, args.kind, - args.topic_id, args.subscription_id, + args.project, + args.datastore_project, + args.kind, + args.topic_id, + args.subscription_id, args.info_types, custom_dictionaries=args.custom_dictionaries, custom_regexes=args.custom_regexes, namespace_id=args.namespace_id, min_likelihood=args.min_likelihood, max_findings=args.max_findings, - timeout=args.timeout) - elif args.content == 'bigquery': + timeout=args.timeout, + ) + elif args.content == "bigquery": inspect_bigquery( - args.project, args.bigquery_project, args.dataset_id, - args.table_id, args.topic_id, args.subscription_id, + args.project, + args.bigquery_project, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, args.info_types, custom_dictionaries=args.custom_dictionaries, custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, - timeout=args.timeout) + timeout=args.timeout, + ) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 8860cf9e7a82..899ed64c3b3c 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -27,18 +27,18 @@ import inspect_content -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -TEST_BUCKET_NAME = GCLOUD_PROJECT + '-dlp-python-client-test' -RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), 'resources') -RESOURCE_FILE_NAMES = ['test.txt', 'test.png', 'harmless.txt', 'accounts.txt'] -TOPIC_ID = 'dlp-test' -SUBSCRIPTION_ID = 'dlp-test-subscription' -DATASTORE_KIND = 'DLP test kind' -BIGQUERY_DATASET_ID = 'dlp_test_dataset' -BIGQUERY_TABLE_ID = 'dlp_test_table' - - -@pytest.fixture(scope='module') +GCLOUD_PROJECT = 
os.getenv("GCLOUD_PROJECT") +TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") +RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"] +TOPIC_ID = "dlp-test" +SUBSCRIPTION_ID = "dlp-test-subscription" +DATASTORE_KIND = "DLP test kind" +BIGQUERY_DATASET_ID = "dlp_test_dataset" +BIGQUERY_TABLE_ID = "dlp_test_table" + + +@pytest.fixture(scope="module") def bucket(): # Creates a GCS bucket, uploads files required for the test, and tears down # the entire bucket afterwards. @@ -62,13 +62,16 @@ def bucket(): # Delete the files. for blob in blobs: - blob.delete() + try: + blob.delete() + except google.cloud.exceptions.NotFound: + print("Issue during teardown, missing blob") # Attempt to delete the bucket; this will only work if it is empty. bucket.delete() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def topic_id(): # Creates a pubsub topic, and tears it down. publisher = google.cloud.pubsub.PublisherClient() @@ -83,13 +86,12 @@ def topic_id(): publisher.delete_topic(topic_path) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def subscription_id(topic_id): # Subscribes to a topic. subscriber = google.cloud.pubsub.SubscriberClient() topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id) - subscription_path = subscriber.subscription_path( - GCLOUD_PROJECT, SUBSCRIPTION_ID) + subscription_path = subscriber.subscription_path(GCLOUD_PROJECT, SUBSCRIPTION_ID) try: subscriber.create_subscription(subscription_path, topic_path) except google.api_core.exceptions.AlreadyExists: @@ -100,16 +102,16 @@ def subscription_id(topic_id): subscriber.delete_subscription(subscription_path) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def datastore_project(): # Adds test Datastore data, yields the project ID and then tears down. datastore_client = google.cloud.datastore.Client() kind = DATASTORE_KIND - name = 'DLP test object' + name = "DLP test object" key = datastore_client.key(kind, name) item = google.cloud.datastore.Entity(key=key) - item['payload'] = 'My name is Gary Smith and my email is gary@example.com' + item["payload"] = "My name is Gary Smith and my email is gary@example.com" datastore_client.put(item) @@ -118,7 +120,7 @@ def datastore_project(): datastore_client.delete(key) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def bigquery_project(): # Adds test Bigquery data, yields the project ID and then tears down. 
bigquery_client = google.cloud.bigquery.Client() @@ -135,8 +137,8 @@ def bigquery_project(): # DO NOT SUBMIT: trim this down once we find out what works table.schema = ( - google.cloud.bigquery.SchemaField('Name', 'STRING'), - google.cloud.bigquery.SchemaField('Comment', 'STRING'), + google.cloud.bigquery.SchemaField("Name", "STRING"), + google.cloud.bigquery.SchemaField("Comment", "STRING"), ) try: @@ -144,9 +146,7 @@ def bigquery_project(): except google.api_core.exceptions.Conflict: table = bigquery_client.get_table(table) - rows_to_insert = [ - (u'Gary Smith', u'My email is gary@example.com',) - ] + rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")] bigquery_client.insert_rows(table, rows_to_insert) @@ -156,52 +156,42 @@ def bigquery_project(): def test_inspect_string(capsys): - test_string = 'My name is Gary Smith and my email is gary@example.com' + test_string = "My name is Gary Smith and my email is gary@example.com" inspect_content.inspect_string( - GCLOUD_PROJECT, - test_string, - ['FIRST_NAME', 'EMAIL_ADDRESS'], - include_quote=True) + GCLOUD_PROJECT, test_string, ["FIRST_NAME", "EMAIL_ADDRESS"], include_quote=True + ) out, _ = capsys.readouterr() - assert 'Info type: FIRST_NAME' in out - assert 'Info type: EMAIL_ADDRESS' in out + assert "Info type: FIRST_NAME" in out + assert "Info type: EMAIL_ADDRESS" in out def test_inspect_table(capsys): test_tabular_data = { - "header": [ - "email", - "phone number" - ], + "header": ["email", "phone number"], "rows": [ - [ - "robertfrost@xyz.com", - "4232342345" - ], - [ - "johndoe@pqr.com", - "4253458383" - ] - ] + ["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"], + ], } inspect_content.inspect_table( GCLOUD_PROJECT, test_tabular_data, - ['PHONE_NUMBER', 'EMAIL_ADDRESS'], - include_quote=True) + ["PHONE_NUMBER", "EMAIL_ADDRESS"], + include_quote=True, + ) out, _ = capsys.readouterr() - assert 'Info type: PHONE_NUMBER' in out - assert 'Info type: EMAIL_ADDRESS' in out + assert "Info type: PHONE_NUMBER" in out + assert "Info type: EMAIL_ADDRESS" in out def test_inspect_string_with_custom_info_types(capsys): - test_string = 'My name is Gary Smith and my email is gary@example.com' - dictionaries = ['Gary Smith'] - regexes = ['\\w+@\\w+.com'] + test_string = "My name is Gary Smith and my email is gary@example.com" + dictionaries = ["Gary Smith"] + regexes = ["\\w+@\\w+.com"] inspect_content.inspect_string( GCLOUD_PROJECT, @@ -209,43 +199,43 @@ def test_inspect_string_with_custom_info_types(capsys): [], custom_dictionaries=dictionaries, custom_regexes=regexes, - include_quote=True) + include_quote=True, + ) out, _ = capsys.readouterr() - assert 'Info type: CUSTOM_DICTIONARY_0' in out - assert 'Info type: CUSTOM_REGEX_0' in out + assert "Info type: CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out def test_inspect_string_no_results(capsys): - test_string = 'Nothing to see here' + test_string = "Nothing to see here" inspect_content.inspect_string( - GCLOUD_PROJECT, - test_string, - ['FIRST_NAME', 'EMAIL_ADDRESS'], - include_quote=True) + GCLOUD_PROJECT, test_string, ["FIRST_NAME", "EMAIL_ADDRESS"], include_quote=True + ) out, _ = capsys.readouterr() - assert 'No findings' in out + assert "No findings" in out def test_inspect_file(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt') + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") inspect_content.inspect_file( GCLOUD_PROJECT, test_filepath, - ['FIRST_NAME', 'EMAIL_ADDRESS'], - include_quote=True) + 
["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) out, _ = capsys.readouterr() - assert 'Info type: EMAIL_ADDRESS' in out + assert "Info type: EMAIL_ADDRESS" in out def test_inspect_file_with_custom_info_types(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt') - dictionaries = ['gary@somedomain.com'] - regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] inspect_content.inspect_file( GCLOUD_PROJECT, @@ -253,37 +243,40 @@ def test_inspect_file_with_custom_info_types(capsys): [], custom_dictionaries=dictionaries, custom_regexes=regexes, - include_quote=True) + include_quote=True, + ) out, _ = capsys.readouterr() - assert 'Info type: CUSTOM_DICTIONARY_0' in out - assert 'Info type: CUSTOM_REGEX_0' in out + assert "Info type: CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out def test_inspect_file_no_results(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, 'harmless.txt') + test_filepath = os.path.join(RESOURCE_DIRECTORY, "harmless.txt") inspect_content.inspect_file( GCLOUD_PROJECT, test_filepath, - ['FIRST_NAME', 'EMAIL_ADDRESS'], - include_quote=True) + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) out, _ = capsys.readouterr() - assert 'No findings' in out + assert "No findings" in out def test_inspect_image_file(capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png') + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") inspect_content.inspect_file( GCLOUD_PROJECT, test_filepath, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], - include_quote=True) + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + include_quote=True, + ) out, _ = capsys.readouterr() - assert 'Info type: PHONE_NUMBER' in out + assert "Info type: PHONE_NUMBER" in out @flaky @@ -291,66 +284,70 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): inspect_content.inspect_gcs_file( GCLOUD_PROJECT, bucket.name, - 'test.txt', + "test.txt", topic_id, subscription_id, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], - timeout=420) + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=420, + ) out, _ = capsys.readouterr() - assert 'Info type: EMAIL_ADDRESS' in out + assert "Info type: EMAIL_ADDRESS" in out @flaky -def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, - subscription_id, capsys): - dictionaries = ['gary@somedomain.com'] - regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] +def test_inspect_gcs_file_with_custom_info_types( + bucket, topic_id, subscription_id, capsys +): + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] inspect_content.inspect_gcs_file( GCLOUD_PROJECT, bucket.name, - 'test.txt', + "test.txt", topic_id, subscription_id, [], custom_dictionaries=dictionaries, custom_regexes=regexes, - timeout=420) + timeout=420, + ) out, _ = capsys.readouterr() - assert 'Info type: CUSTOM_DICTIONARY_0' in out - assert 'Info type: CUSTOM_REGEX_0' in out + assert "Info type: CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out @flaky -def test_inspect_gcs_file_no_results( - bucket, topic_id, subscription_id, capsys): +def test_inspect_gcs_file_no_results(bucket, topic_id, subscription_id, capsys): inspect_content.inspect_gcs_file( GCLOUD_PROJECT, bucket.name, - 'harmless.txt', + "harmless.txt", topic_id, subscription_id, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], - timeout=420) + ["FIRST_NAME", 
"EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=420, + ) out, _ = capsys.readouterr() - assert 'No findings' in out + assert "No findings" in out -@pytest.mark.skip(reason='nondeterministically failing') +@pytest.mark.skip(reason="nondeterministically failing") def test_inspect_gcs_image_file(bucket, topic_id, subscription_id, capsys): inspect_content.inspect_gcs_file( GCLOUD_PROJECT, bucket.name, - 'test.png', + "test.png", topic_id, subscription_id, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']) + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + ) out, _ = capsys.readouterr() - assert 'Info type: EMAIL_ADDRESS' in out + assert "Info type: EMAIL_ADDRESS" in out @flaky @@ -358,19 +355,19 @@ def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys): inspect_content.inspect_gcs_file( GCLOUD_PROJECT, bucket.name, - '*', + "*", topic_id, subscription_id, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']) + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + ) out, _ = capsys.readouterr() - assert 'Info type: EMAIL_ADDRESS' in out - assert 'Info type: PHONE_NUMBER' in out + assert "Info type: EMAIL_ADDRESS" in out + assert "Info type: PHONE_NUMBER" in out @flaky -def test_inspect_datastore( - datastore_project, topic_id, subscription_id, capsys): +def test_inspect_datastore(datastore_project, topic_id, subscription_id, capsys): @eventually_consistent.call def _(): inspect_content.inspect_datastore( @@ -379,30 +376,32 @@ def _(): DATASTORE_KIND, topic_id, subscription_id, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']) + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + ) out, _ = capsys.readouterr() - assert 'Info type: EMAIL_ADDRESS' in out + assert "Info type: EMAIL_ADDRESS" in out @flaky def test_inspect_datastore_no_results( - datastore_project, topic_id, subscription_id, capsys): + datastore_project, topic_id, subscription_id, capsys +): inspect_content.inspect_datastore( GCLOUD_PROJECT, datastore_project, DATASTORE_KIND, topic_id, subscription_id, - ['PHONE_NUMBER']) + ["PHONE_NUMBER"], + ) out, _ = capsys.readouterr() - assert 'No findings' in out + assert "No findings" in out -@pytest.mark.skip(reason='unknown issue') -def test_inspect_bigquery( - bigquery_project, topic_id, subscription_id, capsys): +@pytest.mark.skip(reason="unknown issue") +def test_inspect_bigquery(bigquery_project, topic_id, subscription_id, capsys): inspect_content.inspect_bigquery( GCLOUD_PROJECT, bigquery_project, @@ -410,7 +409,8 @@ def test_inspect_bigquery( BIGQUERY_TABLE_ID, topic_id, subscription_id, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER']) + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + ) out, _ = capsys.readouterr() - assert 'Info type: FIRST_NAME' in out + assert "Info type: FIRST_NAME" in out diff --git a/dlp/jobs.py b/dlp/jobs.py index 43c9c34a3dc3..ec84efbf8f57 100644 --- a/dlp/jobs.py +++ b/dlp/jobs.py @@ -58,32 +58,29 @@ def list_dlp_jobs(project, filter_string=None, job_type=None): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. 
parent = dlp.project_path(project) # Job type dictionary job_type_to_int = { - 'DLP_JOB_TYPE_UNSPECIFIED': - google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED, - 'INSPECT_JOB': google.cloud.dlp.enums.DlpJobType.INSPECT_JOB, - 'RISK_ANALYSIS_JOB': - google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB + "DLP_JOB_TYPE_UNSPECIFIED": google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED, + "INSPECT_JOB": google.cloud.dlp.enums.DlpJobType.INSPECT_JOB, + "RISK_ANALYSIS_JOB": google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB, } # If job type is specified, convert job type to number through enums. if job_type: job_type = job_type_to_int[job_type] # Call the API to get a list of jobs. - response = dlp.list_dlp_jobs( - parent, - filter_=filter_string, - type_=job_type) + response = dlp.list_dlp_jobs(parent, filter_=filter_string, type_=job_type) # Iterate over results. for job in response: - print('Job: %s; status: %s' % (job.name, job.JobState.Name(job.state))) + print("Job: %s; status: %s" % (job.name, job.JobState.Name(job.state))) + + # [END dlp_list_jobs] @@ -102,7 +99,7 @@ def delete_dlp_job(project, job_name): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id and job name into a full resource id. name = dlp.dlp_job_path(project, job_name) @@ -110,49 +107,52 @@ def delete_dlp_job(project, job_name): # Call the API to delete job. dlp.delete_dlp_job(name) - print('Successfully deleted %s' % job_name) + print("Successfully deleted %s" % job_name) + + # [END dlp_delete_job] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( - dest='content', help='Select how to submit content to the API.') + dest="content", help="Select how to submit content to the API." + ) subparsers.required = True list_parser = subparsers.add_parser( - 'list', - help='List Data Loss Prevention API jobs corresponding to a given ' - 'filter.') + "list", + help="List Data Loss Prevention API jobs corresponding to a given " "filter.", + ) list_parser.add_argument( - 'project', - help='The project id to use as a parent resource.') + "project", help="The project id to use as a parent resource." + ) list_parser.add_argument( - '-f', '--filter', - help='Filter expressions are made up of one or more restrictions.') + "-f", + "--filter", + help="Filter expressions are made up of one or more restrictions.", + ) list_parser.add_argument( - '-t', '--type', - choices=['DLP_JOB_TYPE_UNSPECIFIED', 'INSPECT_JOB', - 'RISK_ANALYSIS_JOB'], - help='The type of job. API defaults to "INSPECT"') + "-t", + "--type", + choices=["DLP_JOB_TYPE_UNSPECIFIED", "INSPECT_JOB", "RISK_ANALYSIS_JOB"], + help='The type of job. API defaults to "INSPECT"', + ) delete_parser = subparsers.add_parser( - 'delete', - help='Delete results of a Data Loss Prevention API job.') + "delete", help="Delete results of a Data Loss Prevention API job." + ) delete_parser.add_argument( - 'project', - help='The project id to use as a parent resource.') + "project", help="The project id to use as a parent resource." + ) delete_parser.add_argument( - 'job_name', - help='The name of the DlpJob resource to be deleted. ' - 'Example: X-#####') + "job_name", + help="The name of the DlpJob resource to be deleted. 
" "Example: X-#####", + ) args = parser.parse_args() - if args.content == 'list': - list_dlp_jobs( - args.project, - filter_string=args.filter, - job_type=args.type) - elif args.content == 'delete': + if args.content == "list": + list_dlp_jobs(args.project, filter_string=args.filter, job_type=args.type) + elif args.content == "delete": delete_dlp_job(args.project, args.job_name) diff --git a/dlp/jobs_test.py b/dlp/jobs_test.py index 8f47fb4d4280..15417def67c0 100644 --- a/dlp/jobs_test.py +++ b/dlp/jobs_test.py @@ -18,40 +18,37 @@ import jobs -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -TEST_COLUMN_NAME = 'zip_code' -TEST_TABLE_PROJECT_ID = 'bigquery-public-data' -TEST_DATASET_ID = 'san_francisco' -TEST_TABLE_ID = 'bikeshare_trips' +GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT") +TEST_COLUMN_NAME = "zip_code" +TEST_TABLE_PROJECT_ID = "bigquery-public-data" +TEST_DATASET_ID = "san_francisco" +TEST_TABLE_ID = "bikeshare_trips" -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def test_job_name(): import google.cloud.dlp - dlp = google.cloud.dlp.DlpServiceClient() + + dlp = google.cloud.dlp_v2.DlpServiceClient() parent = dlp.project_path(GCLOUD_PROJECT) # Construct job request risk_job = { - 'privacy_metric': { - 'categorical_stats_config': { - 'field': { - 'name': TEST_COLUMN_NAME - } - } + "privacy_metric": { + "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}} + }, + "source_table": { + "project_id": TEST_TABLE_PROJECT_ID, + "dataset_id": TEST_DATASET_ID, + "table_id": TEST_TABLE_ID, }, - 'source_table': { - 'project_id': TEST_TABLE_PROJECT_ID, - 'dataset_id': TEST_DATASET_ID, - 'table_id': TEST_TABLE_ID - } } response = dlp.create_dlp_job(parent, risk_job=risk_job) full_path = response.name # API expects only job name, not full project path - job_name = full_path[full_path.rfind('/')+1:] + job_name = full_path[full_path.rfind("/") + 1 :] return job_name @@ -59,21 +56,21 @@ def test_list_dlp_jobs(capsys): jobs.list_dlp_jobs(GCLOUD_PROJECT) out, _ = capsys.readouterr() - assert 'Job: projects/' in out + assert "Job: projects/" in out def test_list_dlp_jobs_with_filter(capsys): - jobs.list_dlp_jobs(GCLOUD_PROJECT, filter_string='state=DONE') + jobs.list_dlp_jobs(GCLOUD_PROJECT, filter_string="state=DONE") out, _ = capsys.readouterr() - assert 'Job: projects/' in out + assert "Job: projects/" in out def test_list_dlp_jobs_with_job_type(capsys): - jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type='INSPECT_JOB') + jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type="INSPECT_JOB") out, _ = capsys.readouterr() - assert 'Job: projects/' in out + assert "Job: projects/" in out def test_delete_dlp_job(test_job_name, capsys): diff --git a/dlp/metadata.py b/dlp/metadata.py index de05fa368529..81b8f5e08a48 100644 --- a/dlp/metadata.py +++ b/dlp/metadata.py @@ -34,30 +34,35 @@ def list_info_types(language_code=None, result_filter=None): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Make the API call. response = dlp.list_info_types(language_code, result_filter) # Print the results to the console. 
- print('Info types:') + print("Info types:") for info_type in response.info_types: - print('{name}: {display_name}'.format( - name=info_type.name, display_name=info_type.display_name)) + print( + "{name}: {display_name}".format( + name=info_type.name, display_name=info_type.display_name + ) + ) + + # [END dlp_list_info_types] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - '--language_code', - help='The BCP-47 language code to use, e.g. \'en-US\'.') + "--language_code", help="The BCP-47 language code to use, e.g. 'en-US'." + ) parser.add_argument( - '--filter', - help='An optional filter to only return info types supported by ' - 'certain parts of the API. Defaults to "supported_by=INSPECT".') + "--filter", + help="An optional filter to only return info types supported by " + 'certain parts of the API. Defaults to "supported_by=INSPECT".', + ) args = parser.parse_args() - list_info_types( - language_code=args.language_code, result_filter=args.filter) + list_info_types(language_code=args.language_code, result_filter=args.filter) diff --git a/dlp/metadata_test.py b/dlp/metadata_test.py index a7e3bb9dccef..bde63fd3e8fb 100644 --- a/dlp/metadata_test.py +++ b/dlp/metadata_test.py @@ -19,4 +19,4 @@ def test_fetch_info_types(capsys): metadata.list_info_types() out, _ = capsys.readouterr() - assert 'EMAIL_ADDRESS' in out + assert "EMAIL_ADDRESS" in out diff --git a/dlp/quickstart.py b/dlp/quickstart.py index 736d59ddd8fe..2cc0f1442673 100644 --- a/dlp/quickstart.py +++ b/dlp/quickstart.py @@ -29,19 +29,19 @@ def quickstart(project_id): import google.cloud.dlp # Instantiate a client. - dlp_client = google.cloud.dlp.DlpServiceClient() + dlp_client = google.cloud.dlp_v2.DlpServiceClient() # The string to inspect - content = 'Robert Frost' + content = "Robert Frost" # Construct the item to inspect. - item = {'value': content} + item = {"value": content} # The info types to search for in the content. Required. - info_types = [{'name': 'FIRST_NAME'}, {'name': 'LAST_NAME'}] + info_types = [{"name": "FIRST_NAME"}, {"name": "LAST_NAME"}] # The minimum likelihood to constitute a match. Optional. - min_likelihood = 'LIKELIHOOD_UNSPECIFIED' + min_likelihood = "LIKELIHOOD_UNSPECIFIED" # The maximum number of findings to report (0 = server maximum). Optional. max_findings = 0 @@ -52,10 +52,10 @@ def quickstart(project_id): # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'min_likelihood': min_likelihood, - 'include_quote': include_quote, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, } # Convert the project id into a full resource id. @@ -68,25 +68,25 @@ def quickstart(project_id): if response.result.findings: for finding in response.result.findings: try: - print('Quote: {}'.format(finding.quote)) + print("Quote: {}".format(finding.quote)) except AttributeError: pass - print('Info type: {}'.format(finding.info_type.name)) + print("Info type: {}".format(finding.info_type.name)) # Convert likelihood value to string respresentation. 
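            # The numeric likelihood enum is mapped back to its symbolic name
            # (e.g. "LIKELY") via the proto descriptor on the Finding type.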
- likelihood = (google.cloud.dlp.types.Finding.DESCRIPTOR - .fields_by_name['likelihood'] - .enum_type.values_by_number[finding.likelihood] - .name) - print('Likelihood: {}'.format(likelihood)) + likelihood = ( + google.cloud.dlp.types.Finding.DESCRIPTOR.fields_by_name["likelihood"] + .enum_type.values_by_number[finding.likelihood] + .name + ) + print("Likelihood: {}".format(likelihood)) else: - print('No findings.') + print("No findings.") # [END dlp_quickstart] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "project_id", help="Enter your GCP project id.", type=str) + parser.add_argument("project_id", help="Enter your GCP project id.", type=str) args = parser.parse_args() if len(sys.argv) == 1: parser.print_usage() diff --git a/dlp/quickstart_test.py b/dlp/quickstart_test.py index 19c215fdbb00..72ddd2978030 100644 --- a/dlp/quickstart_test.py +++ b/dlp/quickstart_test.py @@ -20,17 +20,18 @@ import quickstart -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') +GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT") def test_quickstart(capsys): # Mock out project_path to use the test runner's project ID. with mock.patch.object( - google.cloud.dlp.DlpServiceClient, - 'project_path', - return_value='projects/{}'.format(GCLOUD_PROJECT)): + google.cloud.dlp.DlpServiceClient, + "project_path", + return_value="projects/{}".format(GCLOUD_PROJECT), + ): quickstart.quickstart(GCLOUD_PROJECT) out, _ = capsys.readouterr() - assert 'FIRST_NAME' in out - assert 'LAST_NAME' in out + assert "FIRST_NAME" in out + assert "LAST_NAME" in out diff --git a/dlp/redact.py b/dlp/redact.py index 22ed77fa4dba..e3ff08ec65ed 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -18,16 +18,20 @@ from __future__ import print_function import argparse + # [START dlp_redact_image] import mimetypes + # [END dlp_redact_image] import os # [START dlp_redact_image] -def redact_image(project, filename, output_filename, - info_types, min_likelihood=None, mime_type=None): + +def redact_image( + project, filename, output_filename, info_types, min_likelihood=None, mime_type=None +): """Uses the Data Loss Prevention API to redact protected data in an image. Args: project: The Google Cloud project id to use as a parent resource. @@ -47,11 +51,11 @@ def redact_image(project, filename, output_filename, import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). - info_types = [{'name': info_type} for info_type in info_types] + info_types = [{"name": info_type} for info_type in info_types] # Prepare image_redaction_configs, a list of dictionaries. Each dictionary # contains an info_type and optionally the color used for the replacement. @@ -60,87 +64,105 @@ def redact_image(project, filename, output_filename, if info_types is not None: for info_type in info_types: - image_redaction_configs.append({'info_type': info_type}) + image_redaction_configs.append({"info_type": info_type}) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. - inspect_config = { - 'min_likelihood': min_likelihood, - 'info_types': info_types, - } + inspect_config = {"min_likelihood": min_likelihood, "info_types": info_types} # If mime_type is not specified, guess it from the filename. 
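    # mimetypes may return None for unknown extensions, in which case the
    # code below falls back to "application/octet-stream".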
if mime_type is None: mime_guess = mimetypes.MimeTypes().guess_type(filename) - mime_type = mime_guess[0] or 'application/octet-stream' + mime_type = mime_guess[0] or "application/octet-stream" # Select the content type index from the list of supported types. supported_content_types = { None: 0, # "Unspecified" - 'image/jpeg': 1, - 'image/bmp': 2, - 'image/png': 3, - 'image/svg': 4, - 'text/plain': 5, + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, } content_type_index = supported_content_types.get(mime_type, 0) # Construct the byte_item, containing the file's byte data. - with open(filename, mode='rb') as f: - byte_item = {'type': content_type_index, 'data': f.read()} + with open(filename, mode="rb") as f: + byte_item = {"type": content_type_index, "data": f.read()} # Convert the project id into a full resource id. parent = dlp.project_path(project) # Call the API. response = dlp.redact_image( - parent, inspect_config=inspect_config, + parent, + inspect_config=inspect_config, image_redaction_configs=image_redaction_configs, - byte_item=byte_item) + byte_item=byte_item, + ) # Write out the results. - with open(output_filename, mode='wb') as f: + with open(output_filename, mode="wb") as f: f.write(response.redacted_image) - print("Wrote {byte_count} to {filename}".format( - byte_count=len(response.redacted_image), filename=output_filename)) + print( + "Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename + ) + ) + + # [END dlp_redact_image] -if __name__ == '__main__': - default_project = os.environ.get('GCLOUD_PROJECT') +if __name__ == "__main__": + default_project = os.environ.get("GCLOUD_PROJECT") parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("filename", help="The path to the file to inspect.") parser.add_argument( - 'filename', help='The path to the file to inspect.') - parser.add_argument( - 'output_filename', - help='The path to which the redacted image will be written.') + "output_filename", help="The path to which the redacted image will be written." + ) parser.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser.add_argument( - '--info_types', nargs='+', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser.add_argument( - '--mime_type', - help='The MIME type of the file. If not specified, the type is ' - 'inferred via the Python standard library\'s mimetypes module.') + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) args = parser.parse_args() redact_image( - args.project, args.filename, args.output_filename, - args.info_types, min_likelihood=args.min_likelihood, - mime_type=args.mime_type) + args.project, + args.filename, + args.output_filename, + args.info_types, + min_likelihood=args.min_likelihood, + mime_type=args.mime_type, + ) diff --git a/dlp/redact_test.py b/dlp/redact_test.py index 50eb826b051e..39875551b126 100644 --- a/dlp/redact_test.py +++ b/dlp/redact_test.py @@ -20,11 +20,11 @@ import redact -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), 'resources') +GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT") +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tempdir(): tempdir = tempfile.mkdtemp() yield tempdir @@ -32,14 +32,12 @@ def tempdir(): def test_redact_image_file(tempdir, capsys): - test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png') - output_filepath = os.path.join(tempdir, 'redacted.png') + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") redact.redact_image( - GCLOUD_PROJECT, - test_filepath, - output_filepath, - ['FIRST_NAME', 'EMAIL_ADDRESS']) + GCLOUD_PROJECT, test_filepath, output_filepath, ["FIRST_NAME", "EMAIL_ADDRESS"] + ) out, _ = capsys.readouterr() assert output_filepath in out diff --git a/dlp/risk.py b/dlp/risk.py index 273cfd1548da..272d29768dc1 100644 --- a/dlp/risk.py +++ b/dlp/risk.py @@ -20,9 +20,16 @@ # [START dlp_numerical_stats] -def numerical_risk_analysis(project, table_project_id, dataset_id, table_id, - column_name, topic_id, subscription_id, - timeout=300): +def numerical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): """Uses the Data Loss Prevention API to compute risk metrics of a column of numerical data in a Google BigQuery table. Args: @@ -50,22 +57,23 @@ def numerical_risk_analysis(project, table_project_id, dataset_id, table_id, import google.cloud.pubsub def callback(message): - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. 
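
To make the quantile printout below easier to follow, here is a minimal stand-alone version of the same de-duplication loop, with plain integers standing in for results.quantile_values (hypothetical data, not real API output):

quantile_values = [18, 18, 21, 35, 35]  # stand-in for results.quantile_values
prev_value = None
for percent, value in enumerate(quantile_values):
    if prev_value != value:
        print("Value at {}% quantile: {}".format(percent, value))
    prev_value = value
# Prints entries for indices 0, 2 and 3 only; consecutive duplicates are skipped.
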
job = dlp.get_dlp_job(operation.name) results = job.risk_details.numerical_stats_result - print('Value Range: [{}, {}]'.format( - results.min_value.integer_value, - results.max_value.integer_value)) + print( + "Value Range: [{}, {}]".format( + results.min_value.integer_value, results.max_value.integer_value + ) + ) prev_value = None for percent, result in enumerate(results.quantile_values): value = result.integer_value if prev_value != value: - print('Value at {}% quantile: {}'.format( - percent, value)) + print("Value at {}% quantile: {}".format(percent, value)) prev_value = value subscription.set_result(None) else: @@ -73,42 +81,33 @@ def callback(message): message.drop() # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { - 'project_id': table_project_id, - 'dataset_id': dataset_id, - 'table_id': table_id + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, } # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { - 'privacy_metric': { - 'numerical_stats_config': { - 'field': { - 'name': column_name - } - } - }, - 'source_table': source_table, - 'actions': actions + "privacy_metric": {"numerical_stats_config": {"field": {"name": column_name}}}, + "source_table": source_table, + "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job @@ -117,16 +116,27 @@ def callback(message): try: subscription.result(timeout=timeout) except TimeoutError: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) subscription.close() + + # [END dlp_numerical_stats] # [START dlp_categorical_stats] -def categorical_risk_analysis(project, table_project_id, dataset_id, table_id, - column_name, topic_id, subscription_id, - timeout=300): +def categorical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): """Uses the Data Loss Prevention API to compute risk metrics of a column of categorical data in a Google BigQuery table. Args: @@ -154,69 +164,70 @@ def categorical_risk_analysis(project, table_project_id, dataset_id, table_id, import google.cloud.pubsub def callback(message): - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. 
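
One caveat for the histogram printout below: the bucket values are DLP Value messages and the sample formats value.value.integer_value. When the analysed column is a string field (as in the risk_test.py runs against the "Name" column), the value presumably arrives in string_value instead, and integer_value falls back to the proto default of 0. If that mattered, a small helper along these lines could prefer whichever field is populated (a sketch, not part of this change):

def display_value(value_proto):
    # Prefer the string form when present, otherwise fall back to the integer form.
    return value_proto.string_value or value_proto.integer_value
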
job = dlp.get_dlp_job(operation.name) - histogram_buckets = (job.risk_details - .categorical_stats_result - .value_frequency_histogram_buckets) + histogram_buckets = ( + job.risk_details.categorical_stats_result.value_frequency_histogram_buckets + ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): - print('Bucket {}:'.format(i)) - print(' Most common value occurs {} time(s)'.format( - bucket.value_frequency_upper_bound)) - print(' Least common value occurs {} time(s)'.format( - bucket.value_frequency_lower_bound)) - print(' {} unique values total.'.format( - bucket.bucket_size)) + print("Bucket {}:".format(i)) + print( + " Most common value occurs {} time(s)".format( + bucket.value_frequency_upper_bound + ) + ) + print( + " Least common value occurs {} time(s)".format( + bucket.value_frequency_lower_bound + ) + ) + print(" {} unique values total.".format(bucket.bucket_size)) for value in bucket.bucket_values: - print(' Value {} occurs {} time(s)'.format( - value.value.integer_value, value.count)) + print( + " Value {} occurs {} time(s)".format( + value.value.integer_value, value.count + ) + ) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { - 'project_id': table_project_id, - 'dataset_id': dataset_id, - 'table_id': table_id + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, } # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { - 'privacy_metric': { - 'categorical_stats_config': { - 'field': { - 'name': column_name - } - } + "privacy_metric": { + "categorical_stats_config": {"field": {"name": column_name}} }, - 'source_table': source_table, - 'actions': actions + "source_table": source_table, + "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job @@ -225,15 +236,27 @@ def callback(message): try: subscription.result(timeout=timeout) except TimeoutError: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." 
+ ) subscription.close() + + # [END dlp_categorical_stats] # [START dlp_k_anonymity] -def k_anonymity_analysis(project, table_project_id, dataset_id, table_id, - topic_id, subscription_id, quasi_ids, timeout=300): +def k_anonymity_analysis( + project, + table_project_id, + dataset_id, + table_id, + topic_id, + subscription_id, + quasi_ids, + timeout=300, +): """Uses the Data Loss Prevention API to compute the k-anonymity of a column set in a Google BigQuery table. Args: @@ -265,74 +288,75 @@ def get_values(obj): return int(obj.integer_value) def callback(message): - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) - histogram_buckets = (job.risk_details - .k_anonymity_result - .equivalence_class_histogram_buckets) + histogram_buckets = ( + job.risk_details.k_anonymity_result.equivalence_class_histogram_buckets + ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): - print('Bucket {}:'.format(i)) + print("Bucket {}:".format(i)) if bucket.equivalence_class_size_lower_bound: - print(' Bucket size range: [{}, {}]'.format( - bucket.equivalence_class_size_lower_bound, - bucket.equivalence_class_size_upper_bound)) + print( + " Bucket size range: [{}, {}]".format( + bucket.equivalence_class_size_lower_bound, + bucket.equivalence_class_size_upper_bound, + ) + ) for value_bucket in bucket.bucket_values: - print(' Quasi-ID values: {}'.format( - map(get_values, value_bucket.quasi_ids_values) - )) - print(' Class size: {}'.format( - value_bucket.equivalence_class_size)) + print( + " Quasi-ID values: {}".format( + map(get_values, value_bucket.quasi_ids_values) + ) + ) + print( + " Class size: {}".format( + value_bucket.equivalence_class_size + ) + ) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { - 'project_id': table_project_id, - 'dataset_id': dataset_id, - 'table_id': table_id + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, } # Convert quasi id list to Protobuf type def map_fields(field): - return {'name': field} + return {"name": field} quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { - 'privacy_metric': { - 'k_anonymity_config': { - 'quasi_ids': quasi_ids - } - }, - 'source_table': source_table, - 'actions': actions + "privacy_metric": {"k_anonymity_config": {"quasi_ids": quasi_ids}}, + "source_table": source_table, + "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. 
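
As the comment above notes, the subscription must already exist and be attached to the topic. A hedged provisioning sketch, mirroring the fixtures in risk_test.py later in this patch (project, topic_id and subscription_id are the enclosing function's parameters; the positional create_topic/create_subscription call style is assumed from the pubsub client version these samples target):

import google.api_core.exceptions
import google.cloud.pubsub

publisher = google.cloud.pubsub.PublisherClient()
subscriber = google.cloud.pubsub.SubscriberClient()
topic_path = publisher.topic_path(project, topic_id)
subscription_path = subscriber.subscription_path(project, subscription_id)

try:
    publisher.create_topic(topic_path)
except google.api_core.exceptions.AlreadyExists:
    pass
try:
    subscriber.create_subscription(subscription_path, topic_path)
except google.api_core.exceptions.AlreadyExists:
    pass
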
subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job @@ -341,16 +365,28 @@ def map_fields(field): try: subscription.result(timeout=timeout) except TimeoutError: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) subscription.close() + + # [END dlp_k_anonymity] # [START dlp_l_diversity] -def l_diversity_analysis(project, table_project_id, dataset_id, table_id, - topic_id, subscription_id, sensitive_attribute, - quasi_ids, timeout=300): +def l_diversity_analysis( + project, + table_project_id, + dataset_id, + table_id, + topic_id, + subscription_id, + sensitive_attribute, + quasi_ids, + timeout=300, +): """Uses the Data Loss Prevention API to compute the l-diversity of a column set in a Google BigQuery table. Args: @@ -383,79 +419,85 @@ def get_values(obj): return int(obj.integer_value) def callback(message): - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. job = dlp.get_dlp_job(operation.name) histogram_buckets = ( - job.risk_details - .l_diversity_result - .sensitive_value_frequency_histogram_buckets) + job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets + ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): - print('Bucket {}:'.format(i)) - print(' Bucket size range: [{}, {}]'.format( - bucket.sensitive_value_frequency_lower_bound, - bucket.sensitive_value_frequency_upper_bound)) + print("Bucket {}:".format(i)) + print( + " Bucket size range: [{}, {}]".format( + bucket.sensitive_value_frequency_lower_bound, + bucket.sensitive_value_frequency_upper_bound, + ) + ) for value_bucket in bucket.bucket_values: - print(' Quasi-ID values: {}'.format( - map(get_values, value_bucket.quasi_ids_values))) - print(' Class size: {}'.format( - value_bucket.equivalence_class_size)) + print( + " Quasi-ID values: {}".format( + map(get_values, value_bucket.quasi_ids_values) + ) + ) + print( + " Class size: {}".format(value_bucket.equivalence_class_size) + ) for value in value_bucket.top_sensitive_values: - print((' Sensitive value {} occurs {} time(s)' - .format(value.value, value.count))) + print( + ( + " Sensitive value {} occurs {} time(s)".format( + value.value, value.count + ) + ) + ) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. 
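
For concreteness, risk_test.py later in this patch exercises this table reference with values along the following lines (the "harmful" table lives in the integration_tests_dlp dataset of the python-docs-samples project):

source_table = {
    "project_id": "python-docs-samples",
    "dataset_id": "integration_tests_dlp",
    "table_id": "harmful",
}
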
source_table = { - 'project_id': table_project_id, - 'dataset_id': dataset_id, - 'table_id': table_id + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, } # Convert quasi id list to Protobuf type def map_fields(field): - return {'name': field} + return {"name": field} quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { - 'privacy_metric': { - 'l_diversity_config': { - 'quasi_ids': quasi_ids, - 'sensitive_attribute': { - 'name': sensitive_attribute - } + "privacy_metric": { + "l_diversity_config": { + "quasi_ids": quasi_ids, + "sensitive_attribute": {"name": sensitive_attribute}, } }, - 'source_table': source_table, - 'actions': actions + "source_table": source_table, + "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job @@ -464,16 +506,29 @@ def map_fields(field): try: subscription.result(timeout=timeout) except TimeoutError: - print('No event received before the timeout. Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) subscription.close() + + # [END dlp_l_diversity] # [START dlp_k_map] -def k_map_estimate_analysis(project, table_project_id, dataset_id, table_id, - topic_id, subscription_id, quasi_ids, info_types, - region_code='US', timeout=300): +def k_map_estimate_analysis( + project, + table_project_id, + dataset_id, + table_id, + topic_id, + subscription_id, + quasi_ids, + info_types, + region_code="US", + timeout=300, +): """Uses the Data Loss Prevention API to compute the k-map risk estimation of a column set in a Google BigQuery table. Args: @@ -512,78 +567,86 @@ def get_values(obj): return int(obj.integer_value) def callback(message): - if (message.attributes['DlpJobName'] == operation.name): + if message.attributes["DlpJobName"] == operation.name: # This is the message we're looking for, so acknowledge it. message.ack() # Now that the job is done, fetch the results and print them. 
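
A side note on the Quasi-ID print calls in these callbacks: under Python 3, map() returns a lazy iterator, so formatting it directly shows an object repr rather than the values; wrapping it in list() would show the data. A minimal illustration with plain values:

values = map(int, ["19", "42"])
print("Quasi-ID values: {}".format(values))        # Quasi-ID values: <map object at 0x...>
print("Quasi-ID values: {}".format(list(values)))  # Quasi-ID values: [19, 42]
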
job = dlp.get_dlp_job(operation.name) - histogram_buckets = (job.risk_details - .k_map_estimation_result - .k_map_estimation_histogram) + histogram_buckets = ( + job.risk_details.k_map_estimation_result.k_map_estimation_histogram + ) # Print bucket stats for i, bucket in enumerate(histogram_buckets): - print('Bucket {}:'.format(i)) - print(' Anonymity range: [{}, {}]'.format( - bucket.min_anonymity, bucket.max_anonymity)) - print(' Size: {}'.format(bucket.bucket_size)) + print("Bucket {}:".format(i)) + print( + " Anonymity range: [{}, {}]".format( + bucket.min_anonymity, bucket.max_anonymity + ) + ) + print(" Size: {}".format(bucket.bucket_size)) for value_bucket in bucket.bucket_values: - print(' Values: {}'.format( - map(get_values, value_bucket.quasi_ids_values))) - print(' Estimated k-map anonymity: {}'.format( - value_bucket.estimated_anonymity)) + print( + " Values: {}".format( + map(get_values, value_bucket.quasi_ids_values) + ) + ) + print( + " Estimated k-map anonymity: {}".format( + value_bucket.estimated_anonymity + ) + ) subscription.set_result(None) else: # This is not the message we're looking for. message.drop() # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Location info of the BigQuery table. source_table = { - 'project_id': table_project_id, - 'dataset_id': dataset_id, - 'table_id': table_id + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, } # Check that numbers of quasi-ids and info types are equal if len(quasi_ids) != len(info_types): - raise ValueError("""Number of infoTypes and number of quasi-identifiers - must be equal!""") + raise ValueError( + """Number of infoTypes and number of quasi-identifiers + must be equal!""" + ) # Convert quasi id list to Protobuf type def map_fields(quasi_id, info_type): - return {'field': {'name': quasi_id}, 'info_type': {'name': info_type}} + return {"field": {"name": quasi_id}, "info_type": {"name": info_type}} quasi_ids = map(map_fields, quasi_ids, info_types) # Tell the API where to send a notification when the job is complete. - actions = [{ - 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} - }] + actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for risk_job = { - 'privacy_metric': { - 'k_map_estimation_config': { - 'quasi_ids': quasi_ids, - 'region_code': region_code + "privacy_metric": { + "k_map_estimation_config": { + "quasi_ids": quasi_ids, + "region_code": region_code, } }, - 'source_table': source_table, - 'actions': actions + "source_table": source_table, + "actions": actions, } # Create a Pub/Sub client and find the subscription. The subscription is # expected to already be listening to the topic. subscriber = google.cloud.pubsub.SubscriberClient() - subscription_path = subscriber.subscription_path( - project, subscription_id) + subscription_path = subscriber.subscription_path(project, subscription_id) subscription = subscriber.subscribe(subscription_path, callback) # Call API to start risk analysis job @@ -592,180 +655,201 @@ def map_fields(quasi_id, info_type): try: subscription.result(timeout=timeout) except TimeoutError: - print('No event received before the timeout. 
Please verify that the ' - 'subscription provided is subscribed to the topic provided.') + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) subscription.close() + + # [END dlp_k_map] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( - dest='content', help='Select how to submit content to the API.') + dest="content", help="Select how to submit content to the API." + ) subparsers.required = True - numerical_parser = subparsers.add_parser( - 'numerical', - help='') + numerical_parser = subparsers.add_parser("numerical", help="") numerical_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + "project", help="The Google Cloud project id to use as a parent resource." + ) numerical_parser.add_argument( - 'table_project_id', - help='The Google Cloud project id where the BigQuery table is stored.') + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) numerical_parser.add_argument( - 'dataset_id', - help='The id of the dataset to inspect.') + "dataset_id", help="The id of the dataset to inspect." + ) + numerical_parser.add_argument("table_id", help="The id of the table to inspect.") numerical_parser.add_argument( - 'table_id', - help='The id of the table to inspect.') + "column_name", help="The name of the column to compute risk metrics for." + ) numerical_parser.add_argument( - 'column_name', - help='The name of the column to compute risk metrics for.') + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) numerical_parser.add_argument( - 'topic_id', - help='The name of the Pub/Sub topic to notify once the job completes.') + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) numerical_parser.add_argument( - 'subscription_id', - help='The name of the Pub/Sub subscription to use when listening for' - 'job completion notifications.') - numerical_parser.add_argument( - '--timeout', type=int, - help='The number of seconds to wait for a response from the API.') + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) - categorical_parser = subparsers.add_parser( - 'categorical', - help='') - categorical_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + categorical_parser = subparsers.add_parser("categorical", help="") categorical_parser.add_argument( - 'table_project_id', - help='The Google Cloud project id where the BigQuery table is stored.') + "project", help="The Google Cloud project id to use as a parent resource." + ) categorical_parser.add_argument( - 'dataset_id', - help='The id of the dataset to inspect.') + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) categorical_parser.add_argument( - 'table_id', - help='The id of the table to inspect.') + "dataset_id", help="The id of the dataset to inspect." + ) + categorical_parser.add_argument("table_id", help="The id of the table to inspect.") categorical_parser.add_argument( - 'column_name', - help='The name of the column to compute risk metrics for.') + "column_name", help="The name of the column to compute risk metrics for." 
+ ) categorical_parser.add_argument( - 'topic_id', - help='The name of the Pub/Sub topic to notify once the job completes.') + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) categorical_parser.add_argument( - 'subscription_id', - help='The name of the Pub/Sub subscription to use when listening for' - 'job completion notifications.') + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) categorical_parser.add_argument( - '--timeout', type=int, - help='The number of seconds to wait for a response from the API.') + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) k_anonymity_parser = subparsers.add_parser( - 'k_anonymity', - help='Computes the k-anonymity of a column set in a Google BigQuery' - 'table.') - k_anonymity_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + "k_anonymity", + help="Computes the k-anonymity of a column set in a Google BigQuery" "table.", + ) k_anonymity_parser.add_argument( - 'table_project_id', - help='The Google Cloud project id where the BigQuery table is stored.') + "project", help="The Google Cloud project id to use as a parent resource." + ) k_anonymity_parser.add_argument( - 'dataset_id', - help='The id of the dataset to inspect.') + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) k_anonymity_parser.add_argument( - 'table_id', - help='The id of the table to inspect.') + "dataset_id", help="The id of the dataset to inspect." + ) + k_anonymity_parser.add_argument("table_id", help="The id of the table to inspect.") k_anonymity_parser.add_argument( - 'topic_id', - help='The name of the Pub/Sub topic to notify once the job completes.') + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) k_anonymity_parser.add_argument( - 'subscription_id', - help='The name of the Pub/Sub subscription to use when listening for' - 'job completion notifications.') + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) k_anonymity_parser.add_argument( - 'quasi_ids', nargs='+', - help='A set of columns that form a composite key.') + "quasi_ids", nargs="+", help="A set of columns that form a composite key." + ) k_anonymity_parser.add_argument( - '--timeout', type=int, - help='The number of seconds to wait for a response from the API.') + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) l_diversity_parser = subparsers.add_parser( - 'l_diversity', - help='Computes the l-diversity of a column set in a Google BigQuery' - 'table.') + "l_diversity", + help="Computes the l-diversity of a column set in a Google BigQuery" "table.", + ) l_diversity_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + "project", help="The Google Cloud project id to use as a parent resource." + ) l_diversity_parser.add_argument( - 'table_project_id', - help='The Google Cloud project id where the BigQuery table is stored.') + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) l_diversity_parser.add_argument( - 'dataset_id', - help='The id of the dataset to inspect.') + "dataset_id", help="The id of the dataset to inspect." 
+ ) + l_diversity_parser.add_argument("table_id", help="The id of the table to inspect.") l_diversity_parser.add_argument( - 'table_id', - help='The id of the table to inspect.') + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) l_diversity_parser.add_argument( - 'topic_id', - help='The name of the Pub/Sub topic to notify once the job completes.') + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) l_diversity_parser.add_argument( - 'subscription_id', - help='The name of the Pub/Sub subscription to use when listening for' - 'job completion notifications.') + "sensitive_attribute", help="The column to measure l-diversity relative to." + ) l_diversity_parser.add_argument( - 'sensitive_attribute', - help='The column to measure l-diversity relative to.') + "quasi_ids", nargs="+", help="A set of columns that form a composite key." + ) l_diversity_parser.add_argument( - 'quasi_ids', nargs='+', - help='A set of columns that form a composite key.') - l_diversity_parser.add_argument( - '--timeout', type=int, - help='The number of seconds to wait for a response from the API.') + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) k_map_parser = subparsers.add_parser( - 'k_map', - help='Computes the k-map risk estimation of a column set in a Google' - 'BigQuery table.') - k_map_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') - k_map_parser.add_argument( - 'table_project_id', - help='The Google Cloud project id where the BigQuery table is stored.') + "k_map", + help="Computes the k-map risk estimation of a column set in a Google" + "BigQuery table.", + ) k_map_parser.add_argument( - 'dataset_id', - help='The id of the dataset to inspect.') + "project", help="The Google Cloud project id to use as a parent resource." + ) k_map_parser.add_argument( - 'table_id', - help='The id of the table to inspect.') + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) + k_map_parser.add_argument("dataset_id", help="The id of the dataset to inspect.") + k_map_parser.add_argument("table_id", help="The id of the table to inspect.") k_map_parser.add_argument( - 'topic_id', - help='The name of the Pub/Sub topic to notify once the job completes.') + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) k_map_parser.add_argument( - 'subscription_id', - help='The name of the Pub/Sub subscription to use when listening for' - 'job completion notifications.') + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) k_map_parser.add_argument( - 'quasi_ids', nargs='+', - help='A set of columns that form a composite key.') + "quasi_ids", nargs="+", help="A set of columns that form a composite key." 
+ ) k_map_parser.add_argument( - '-t', '--info-types', nargs='+', - help='Type of information of the quasi_id in order to provide a' - 'statistical model of population.', - required=True) + "-t", + "--info-types", + nargs="+", + help="Type of information of the quasi_id in order to provide a" + "statistical model of population.", + required=True, + ) k_map_parser.add_argument( - '-r', '--region-code', default='US', - help='The ISO 3166-1 region code that the data is representative of.') + "-r", + "--region-code", + default="US", + help="The ISO 3166-1 region code that the data is representative of.", + ) k_map_parser.add_argument( - '--timeout', type=int, - help='The number of seconds to wait for a response from the API.') + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) args = parser.parse_args() - if args.content == 'numerical': + if args.content == "numerical": numerical_risk_analysis( args.project, args.table_project_id, @@ -774,8 +858,9 @@ def map_fields(quasi_id, info_type): args.column_name, args.topic_id, args.subscription_id, - timeout=args.timeout) - elif args.content == 'categorical': + timeout=args.timeout, + ) + elif args.content == "categorical": categorical_risk_analysis( args.project, args.table_project_id, @@ -784,8 +869,9 @@ def map_fields(quasi_id, info_type): args.column_name, args.topic_id, args.subscription_id, - timeout=args.timeout) - elif args.content == 'k_anonymity': + timeout=args.timeout, + ) + elif args.content == "k_anonymity": k_anonymity_analysis( args.project, args.table_project_id, @@ -794,8 +880,9 @@ def map_fields(quasi_id, info_type): args.topic_id, args.subscription_id, args.quasi_ids, - timeout=args.timeout) - elif args.content == 'l_diversity': + timeout=args.timeout, + ) + elif args.content == "l_diversity": l_diversity_analysis( args.project, args.table_project_id, @@ -805,8 +892,9 @@ def map_fields(quasi_id, info_type): args.subscription_id, args.sensitive_attribute, args.quasi_ids, - timeout=args.timeout) - elif args.content == 'k_map': + timeout=args.timeout, + ) + elif args.content == "k_map": k_map_estimate_analysis( args.project, args.table_project_id, @@ -817,4 +905,5 @@ def map_fields(quasi_id, info_type): args.quasi_ids, args.info_types, region_code=args.region_code, - timeout=args.timeout) + timeout=args.timeout, + ) diff --git a/dlp/risk_test.py b/dlp/risk_test.py index d89e83bcb499..dafb58523bce 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -19,19 +19,19 @@ import risk -GCLOUD_PROJECT = 'nodejs-docs-samples' -TABLE_PROJECT = 'nodejs-docs-samples' -TOPIC_ID = 'dlp-test' -SUBSCRIPTION_ID = 'dlp-test-subscription' -DATASET_ID = 'integration_tests_dlp' -UNIQUE_FIELD = 'Name' -REPEATED_FIELD = 'Mystery' -NUMERIC_FIELD = 'Age' -STRING_BOOLEAN_FIELD = 'Gender' +GCLOUD_PROJECT = "python-docs-samples" +TABLE_PROJECT = "python-docs-samples" +TOPIC_ID = "dlp-test" +SUBSCRIPTION_ID = "dlp-test-subscription" +DATASET_ID = "integration_tests_dlp" +UNIQUE_FIELD = "Name" +REPEATED_FIELD = "Mystery" +NUMERIC_FIELD = "Age" +STRING_BOOLEAN_FIELD = "Gender" # Create new custom topic/subscription -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def topic_id(): # Creates a pubsub topic, and tears it down. publisher = google.cloud.pubsub.PublisherClient() @@ -46,13 +46,12 @@ def topic_id(): publisher.delete_topic(topic_path) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def subscription_id(topic_id): # Subscribes to a topic. 
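
The fixture below tolerates an already-existing subscription; a matching teardown would presumably remove it again once the module's tests finish, along these lines (a hypothetical sketch; the positional delete_subscription call style is assumed from the same client version):

try:
    subscriber.delete_subscription(subscription_path)
except google.api_core.exceptions.NotFound:
    pass
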
subscriber = google.cloud.pubsub.SubscriberClient() topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id) - subscription_path = subscriber.subscription_path( - GCLOUD_PROJECT, SUBSCRIPTION_ID) + subscription_path = subscriber.subscription_path(GCLOUD_PROJECT, SUBSCRIPTION_ID) try: subscriber.create_subscription(subscription_path, topic_path) except google.api_core.exceptions.AlreadyExists: @@ -69,45 +68,47 @@ def test_numerical_risk_analysis(topic_id, subscription_id, capsys): GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", NUMERIC_FIELD, topic_id, - subscription_id) + subscription_id, + ) out, _ = capsys.readouterr() - assert 'Value Range:' in out + assert "Value Range:" in out @flaky -def test_categorical_risk_analysis_on_string_field( - topic_id, subscription_id, capsys): +def test_categorical_risk_analysis_on_string_field(topic_id, subscription_id, capsys): risk.categorical_risk_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", UNIQUE_FIELD, topic_id, - subscription_id, timeout=180) + subscription_id, + timeout=180, + ) out, _ = capsys.readouterr() - assert 'Most common value occurs' in out + assert "Most common value occurs" in out @flaky -def test_categorical_risk_analysis_on_number_field( - topic_id, subscription_id, capsys): +def test_categorical_risk_analysis_on_number_field(topic_id, subscription_id, capsys): risk.categorical_risk_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", NUMERIC_FIELD, topic_id, - subscription_id) + subscription_id, + ) out, _ = capsys.readouterr() - assert 'Most common value occurs' in out + assert "Most common value occurs" in out @flaky @@ -116,31 +117,32 @@ def test_k_anonymity_analysis_single_field(topic_id, subscription_id, capsys): GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, - [NUMERIC_FIELD]) + [NUMERIC_FIELD], + ) out, _ = capsys.readouterr() - assert 'Quasi-ID values:' in out - assert 'Class size:' in out + assert "Quasi-ID values:" in out + assert "Class size:" in out @flaky -def test_k_anonymity_analysis_multiple_fields(topic_id, subscription_id, - capsys): +def test_k_anonymity_analysis_multiple_fields(topic_id, subscription_id, capsys): risk.k_anonymity_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, - [NUMERIC_FIELD, REPEATED_FIELD]) + [NUMERIC_FIELD, REPEATED_FIELD], + ) out, _ = capsys.readouterr() - assert 'Quasi-ID values:' in out - assert 'Class size:' in out + assert "Quasi-ID values:" in out + assert "Class size:" in out @flaky @@ -149,85 +151,86 @@ def test_l_diversity_analysis_single_field(topic_id, subscription_id, capsys): GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, UNIQUE_FIELD, - [NUMERIC_FIELD]) + [NUMERIC_FIELD], + ) out, _ = capsys.readouterr() - assert 'Quasi-ID values:' in out - assert 'Class size:' in out - assert 'Sensitive value' in out + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out @flaky -def test_l_diversity_analysis_multiple_field( - topic_id, subscription_id, capsys): +def test_l_diversity_analysis_multiple_field(topic_id, subscription_id, capsys): risk.l_diversity_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, UNIQUE_FIELD, - [NUMERIC_FIELD, REPEATED_FIELD]) + [NUMERIC_FIELD, REPEATED_FIELD], + ) out, _ = capsys.readouterr() - assert 'Quasi-ID values:' in out - 
assert 'Class size:' in out - assert 'Sensitive value' in out + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out @flaky -def test_k_map_estimate_analysis_single_field( - topic_id, subscription_id, capsys): +def test_k_map_estimate_analysis_single_field(topic_id, subscription_id, capsys): risk.k_map_estimate_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, [NUMERIC_FIELD], - ['AGE']) + ["AGE"], + ) out, _ = capsys.readouterr() - assert 'Anonymity range:' in out - assert 'Size:' in out - assert 'Values' in out + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out @flaky -def test_k_map_estimate_analysis_multiple_field( - topic_id, subscription_id, capsys): +def test_k_map_estimate_analysis_multiple_field(topic_id, subscription_id, capsys): risk.k_map_estimate_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], - ['AGE', 'GENDER']) + ["AGE", "GENDER"], + ) out, _ = capsys.readouterr() - assert 'Anonymity range:' in out - assert 'Size:' in out - assert 'Values' in out + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out @flaky -def test_k_map_estimate_analysis_quasi_ids_info_types_equal( - topic_id, subscription_id): +def test_k_map_estimate_analysis_quasi_ids_info_types_equal(topic_id, subscription_id): with pytest.raises(ValueError): risk.k_map_estimate_analysis( GCLOUD_PROJECT, TABLE_PROJECT, DATASET_ID, - 'harmful', + "harmful", topic_id, subscription_id, [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], - ['AGE']) + ["AGE"], + ) diff --git a/dlp/templates.py b/dlp/templates.py index 85d8a0bb494e..9e29245a248d 100644 --- a/dlp/templates.py +++ b/dlp/templates.py @@ -22,10 +22,15 @@ # [START dlp_create_template] -def create_inspect_template(project, info_types, - template_id=None, display_name=None, - min_likelihood=None, max_findings=None, - include_quote=None): +def create_inspect_template( + project, + info_types, + template_id=None, + display_name=None, + min_likelihood=None, + max_findings=None, + include_quote=None, +): """Creates a Data Loss Prevention API inspect template. Args: project: The Google Cloud project id to use as a parent resource. @@ -48,34 +53,33 @@ def create_inspect_template(project, info_types, import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). - info_types = [{'name': info_type} for info_type in info_types] + info_types = [{"name": info_type} for info_type in info_types] # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'min_likelihood': min_likelihood, - 'include_quote': include_quote, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, } - inspect_template = { - 'inspect_config': inspect_config, - 'display_name': display_name, - } + inspect_template = {"inspect_config": inspect_config, "display_name": display_name} # Convert the project id into a full resource id. parent = dlp.project_path(project) # Call the API. 
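
For orientation, a minimal way to exercise this function end to end, mirroring templates_test.py further down in this patch (the project id here is a placeholder):

import templates

templates.create_inspect_template(
    "my-project",  # placeholder project id
    ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
    template_id="test-template",
    max_findings=100,
)
# The created template is then addressable as
# "projects/my-project/inspectTemplates/test-template",
# which is the form delete_inspect_template() reconstructs below.
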
response = dlp.create_inspect_template( - parent, inspect_template=inspect_template, template_id=template_id) + parent, inspect_template=inspect_template, template_id=template_id + ) + + print("Successfully created template {}".format(response.name)) - print('Successfully created template {}'.format(response.name)) # [END dlp_create_template] @@ -93,7 +97,7 @@ def list_inspect_templates(project): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) @@ -107,22 +111,24 @@ def human_readable_time(timestamp): return str(time.localtime(timestamp.seconds)) for template in response: - print('Template {}:'.format(template.name)) + print("Template {}:".format(template.name)) if template.display_name: - print(' Display Name: {}'.format(template.display_name)) - print(' Created: {}'.format( - human_readable_time(template.create_time))) - print(' Updated: {}'.format( - human_readable_time(template.update_time))) + print(" Display Name: {}".format(template.display_name)) + print(" Created: {}".format(human_readable_time(template.create_time))) + print(" Updated: {}".format(human_readable_time(template.update_time))) config = template.inspect_config - print(' InfoTypes: {}'.format(', '.join( - [it.name for it in config.info_types] - ))) - print(' Minimum likelihood: {}'.format(config.min_likelihood)) - print(' Include quotes: {}'.format(config.include_quote)) - print(' Max findings per request: {}'.format( - config.limits.max_findings_per_request)) + print( + " InfoTypes: {}".format(", ".join([it.name for it in config.info_types])) + ) + print(" Minimum likelihood: {}".format(config.min_likelihood)) + print(" Include quotes: {}".format(config.include_quote)) + print( + " Max findings per request: {}".format( + config.limits.max_findings_per_request + ) + ) + # [END dlp_list_templates] @@ -141,89 +147,108 @@ def delete_inspect_template(project, template_id): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Combine the template id with the parent id. - template_resource = '{}/inspectTemplates/{}'.format(parent, template_id) + template_resource = "{}/inspectTemplates/{}".format(parent, template_id) # Call the API. dlp.delete_inspect_template(template_resource) - print('Template {} successfully deleted.'.format(template_resource)) + print("Template {} successfully deleted.".format(template_resource)) + # [END dlp_delete_template] -if __name__ == '__main__': - default_project = os.environ.get('GCLOUD_PROJECT') +if __name__ == "__main__": + default_project = os.environ.get("GCLOUD_PROJECT") parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( - dest='action', help='Select which action to perform.') + dest="action", help="Select which action to perform." + ) subparsers.required = True - parser_create = subparsers.add_parser('create', help='Create a template.') + parser_create = subparsers.add_parser("create", help="Create a template.") parser_create.add_argument( - '--template_id', - help='The id of the template. If omitted, an id will be randomly ' - 'generated') + "--template_id", + help="The id of the template. 
If omitted, an id will be randomly " "generated", + ) parser_create.add_argument( - '--display_name', - help='The optional display name of the template.') + "--display_name", help="The optional display name of the template." + ) parser_create.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_create.add_argument( - '--info_types', nargs='+', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_create.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_create.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_create.add_argument( - '--include_quote', type=bool, - help='A boolean for whether to display a quote of the detected ' - 'information in the results.', - default=True) - - parser_list = subparsers.add_parser('list', help='List all templates.') + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_list = subparsers.add_parser("list", help="List all templates.") parser_list.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) - parser_delete = subparsers.add_parser('delete', help='Delete a template.') - parser_delete.add_argument( - 'template_id', - help='The id of the template to delete.') + parser_delete = subparsers.add_parser("delete", help="Delete a template.") + parser_delete.add_argument("template_id", help="The id of the template to delete.") parser_delete.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) args = parser.parse_args() - if args.action == 'create': + if args.action == "create": create_inspect_template( - args.project, args.info_types, - template_id=args.template_id, display_name=args.display_name, + args.project, + args.info_types, + template_id=args.template_id, + display_name=args.display_name, 
min_likelihood=args.min_likelihood, - max_findings=args.max_findings, include_quote=args.include_quote + max_findings=args.max_findings, + include_quote=args.include_quote, ) - elif args.action == 'list': + elif args.action == "list": list_inspect_templates(args.project) - elif args.action == 'delete': + elif args.action == "delete": delete_inspect_template(args.project, args.template_id) diff --git a/dlp/templates_test.py b/dlp/templates_test.py index 776096719ef7..dff157a9ee67 100644 --- a/dlp/templates_test.py +++ b/dlp/templates_test.py @@ -20,14 +20,15 @@ import templates -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -TEST_TEMPLATE_ID = 'test-template' +GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT") +TEST_TEMPLATE_ID = "test-template" def test_create_list_and_delete_template(capsys): try: templates.create_inspect_template( - GCLOUD_PROJECT, ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], template_id=TEST_TEMPLATE_ID, ) except google.api_core.exceptions.InvalidArgument: @@ -39,7 +40,8 @@ def test_create_list_and_delete_template(capsys): # Try again and move on. templates.create_inspect_template( - GCLOUD_PROJECT, ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], template_id=TEST_TEMPLATE_ID, ) diff --git a/dlp/triggers.py b/dlp/triggers.py index a486386e7675..c786cf6e5473 100644 --- a/dlp/triggers.py +++ b/dlp/triggers.py @@ -22,10 +22,18 @@ # [START dlp_create_trigger] -def create_trigger(project, bucket, scan_period_days, info_types, - trigger_id=None, display_name=None, description=None, - min_likelihood=None, max_findings=None, - auto_populate_timespan=False): +def create_trigger( + project, + bucket, + scan_period_days, + info_types, + trigger_id=None, + display_name=None, + description=None, + min_likelihood=None, + max_findings=None, + auto_populate_timespan=False, +): """Creates a scheduled Data Loss Prevention API inspect_content trigger. Args: project: The Google Cloud project id to use as a parent resource. @@ -53,56 +61,47 @@ def create_trigger(project, bucket, scan_period_days, info_types, import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Prepare info_types by converting the list of strings into a list of # dictionaries (protos are also accepted). - info_types = [{'name': info_type} for info_type in info_types] + info_types = [{"name": info_type} for info_type in info_types] # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { - 'info_types': info_types, - 'min_likelihood': min_likelihood, - 'limits': {'max_findings_per_request': max_findings}, + "info_types": info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, } # Construct a cloud_storage_options dictionary with the bucket's URL. - url = 'gs://{}/*'.format(bucket) + url = "gs://{}/*".format(bucket) storage_config = { - 'cloud_storage_options': { - 'file_set': {'url': url} - }, + "cloud_storage_options": {"file_set": {"url": url}}, # Time-based configuration for each storage object. - 'timespan_config': { + "timespan_config": { # Auto-populate start and end times in order to scan new objects # only. - 'enable_auto_population_of_timespan_config': auto_populate_timespan + "enable_auto_population_of_timespan_config": auto_populate_timespan }, } # Construct the job definition. 
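
The schedule constructed a few lines below expresses the recurrence period in seconds. For the 7-day period used in triggers_test.py, the arithmetic works out as:

scan_period_days = 7
recurrence_seconds = scan_period_days * 60 * 60 * 24  # 604800 seconds
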
- job = { - 'inspect_config': inspect_config, - 'storage_config': storage_config, - } + job = {"inspect_config": inspect_config, "storage_config": storage_config} # Construct the schedule definition: schedule = { - 'recurrence_period_duration': { - 'seconds': scan_period_days * 60 * 60 * 24, - } + "recurrence_period_duration": {"seconds": scan_period_days * 60 * 60 * 24} } # Construct the trigger definition. job_trigger = { - 'inspect_job': job, - 'display_name': display_name, - 'description': description, - 'triggers': [ - {'schedule': schedule} - ], - 'status': 'HEALTHY' + "inspect_job": job, + "display_name": display_name, + "description": description, + "triggers": [{"schedule": schedule}], + "status": "HEALTHY", } # Convert the project id into a full resource id. @@ -110,9 +109,11 @@ def create_trigger(project, bucket, scan_period_days, info_types, # Call the API. response = dlp.create_job_trigger( - parent, job_trigger=job_trigger, trigger_id=trigger_id) + parent, job_trigger=job_trigger, trigger_id=trigger_id + ) + + print("Successfully created trigger {}".format(response.name)) - print('Successfully created trigger {}'.format(response.name)) # [END dlp_create_trigger] @@ -130,7 +131,7 @@ def list_triggers(project): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) @@ -144,15 +145,16 @@ def human_readable_time(timestamp): return str(time.localtime(timestamp.seconds)) for trigger in response: - print('Trigger {}:'.format(trigger.name)) - print(' Created: {}'.format(human_readable_time(trigger.create_time))) - print(' Updated: {}'.format(human_readable_time(trigger.update_time))) + print("Trigger {}:".format(trigger.name)) + print(" Created: {}".format(human_readable_time(trigger.create_time))) + print(" Updated: {}".format(human_readable_time(trigger.update_time))) if trigger.display_name: - print(' Display Name: {}'.format(trigger.display_name)) + print(" Display Name: {}".format(trigger.display_name)) if trigger.description: - print(' Description: {}'.format(trigger.discription)) - print(' Status: {}'.format(trigger.status)) - print(' Error count: {}'.format(len(trigger.errors))) + print(" Description: {}".format(trigger.discription)) + print(" Status: {}".format(trigger.status)) + print(" Error count: {}".format(len(trigger.errors))) + # [END dlp_list_triggers] @@ -171,96 +173,118 @@ def delete_trigger(project, trigger_id): import google.cloud.dlp # Instantiate a client. - dlp = google.cloud.dlp.DlpServiceClient() + dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. parent = dlp.project_path(project) # Combine the trigger id with the parent id. - trigger_resource = '{}/jobTriggers/{}'.format(parent, trigger_id) + trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id) # Call the API. dlp.delete_job_trigger(trigger_resource) - print('Trigger {} successfully deleted.'.format(trigger_resource)) + print("Trigger {} successfully deleted.".format(trigger_resource)) + # [END dlp_delete_triggers] -if __name__ == '__main__': - default_project = os.environ.get('GCLOUD_PROJECT') +if __name__ == "__main__": + default_project = os.environ.get("GCLOUD_PROJECT") parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( - dest='action', help='Select which action to perform.') + dest="action", help="Select which action to perform." 
+ ) subparsers.required = True - parser_create = subparsers.add_parser('create', help='Create a trigger.') + parser_create = subparsers.add_parser("create", help="Create a trigger.") parser_create.add_argument( - 'bucket', help='The name of the GCS bucket containing the file.') + "bucket", help="The name of the GCS bucket containing the file." + ) parser_create.add_argument( - 'scan_period_days', type=int, - help='How often to repeat the scan, in days. The minimum is 1 day.') + "scan_period_days", + type=int, + help="How often to repeat the scan, in days. The minimum is 1 day.", + ) parser_create.add_argument( - '--trigger_id', - help='The id of the trigger. If omitted, an id will be randomly ' - 'generated') + "--trigger_id", + help="The id of the trigger. If omitted, an id will be randomly " "generated", + ) parser_create.add_argument( - '--display_name', - help='The optional display name of the trigger.') + "--display_name", help="The optional display name of the trigger." + ) parser_create.add_argument( - '--description', - help='The optional description of the trigger.') + "--description", help="The optional description of the trigger." + ) parser_create.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) parser_create.add_argument( - '--info_types', nargs='+', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' - 'If unspecified, the three above examples will be used.', - default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) parser_create.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) parser_create.add_argument( - '--max_findings', type=int, - help='The maximum number of findings to report; 0 = no maximum.') + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) parser_create.add_argument( - '--auto_populate_timespan', type=bool, - help='Limit scan to new content only.') + "--auto_populate_timespan", type=bool, help="Limit scan to new content only." 
+ ) - parser_list = subparsers.add_parser('list', help='List all triggers.') + parser_list = subparsers.add_parser("list", help="List all triggers.") parser_list.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) - parser_delete = subparsers.add_parser('delete', help='Delete a trigger.') - parser_delete.add_argument( - 'trigger_id', - help='The id of the trigger to delete.') + parser_delete = subparsers.add_parser("delete", help="Delete a trigger.") + parser_delete.add_argument("trigger_id", help="The id of the trigger to delete.") parser_delete.add_argument( - '--project', - help='The Google Cloud project id to use as a parent resource.', - default=default_project) + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) args = parser.parse_args() - if args.action == 'create': + if args.action == "create": create_trigger( - args.project, args.bucket, args.scan_period_days, args.info_types, - trigger_id=args.trigger_id, display_name=args.display_name, - description=args.description, min_likelihood=args.min_likelihood, + args.project, + args.bucket, + args.scan_period_days, + args.info_types, + trigger_id=args.trigger_id, + display_name=args.display_name, + description=args.description, + min_likelihood=args.min_likelihood, max_findings=args.max_findings, auto_populate_timespan=args.auto_populate_timespan, ) - elif args.action == 'list': + elif args.action == "list": list_triggers(args.project) - elif args.action == 'delete': + elif args.action == "delete": delete_trigger(args.project, args.trigger_id) diff --git a/dlp/triggers_test.py b/dlp/triggers_test.py index 18181886fb76..6a9d7d792613 100644 --- a/dlp/triggers_test.py +++ b/dlp/triggers_test.py @@ -22,14 +22,14 @@ import triggers -GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') -TEST_BUCKET_NAME = GCLOUD_PROJECT + '-dlp-python-client-test' -RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), 'resources') -RESOURCE_FILE_NAMES = ['test.txt', 'test.png', 'harmless.txt', 'accounts.txt'] -TEST_TRIGGER_ID = 'test-trigger' +GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT") +TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") +RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"] +TEST_TRIGGER_ID = "test-trigger" -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def bucket(): # Creates a GCS bucket, uploads files required for the test, and tears down # the entire bucket afterwards. @@ -56,7 +56,7 @@ def bucket(): try: blob.delete() except google.cloud.exceptions.NotFound: - print('Issue during teardown, missing blob') + print("Issue during teardown, missing blob") # Attempt to delete the bucket; this will only work if it is empty. bucket.delete() @@ -65,8 +65,10 @@ def bucket(): def test_create_list_and_delete_trigger(bucket, capsys): try: triggers.create_trigger( - GCLOUD_PROJECT, bucket.name, 7, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], + GCLOUD_PROJECT, + bucket.name, + 7, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], trigger_id=TEST_TRIGGER_ID, ) except google.api_core.exceptions.InvalidArgument: @@ -78,8 +80,10 @@ def test_create_list_and_delete_trigger(bucket, capsys): # Try again and move on. 
triggers.create_trigger( - GCLOUD_PROJECT, bucket.name, 7, - ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], + GCLOUD_PROJECT, + bucket.name, + 7, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], trigger_id=TEST_TRIGGER_ID, auto_populate_timespan=True, )
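
A minimal sketch of driving the reformatted triggers sample end to end, mirroring the flow the test above exercises. It assumes GCLOUD_PROJECT is set, that the bucket name derived below is a placeholder bucket you own, that application-default credentials with DLP access are configured, and that it is run from the dlp/ sample directory; the trigger id "test-trigger" is likewise only an example value.

    import os

    import triggers  # dlp/triggers.py as updated in this patch

    project = os.environ["GCLOUD_PROJECT"]        # assumes the env var is set
    bucket = project + "-dlp-python-client-test"  # placeholder bucket name

    # Create a trigger that rescans the bucket every 7 days, limited to
    # objects added since the previous run.
    triggers.create_trigger(
        project,
        bucket,
        7,
        ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
        trigger_id="test-trigger",
        auto_populate_timespan=True,
    )

    # List the project's triggers; the new trigger should appear in the output.
    triggers.list_triggers(project)

    # Clean up the example trigger.
    triggers.delete_trigger(project, "test-trigger")

The same flow is available from the argparse entry point, e.g. "python triggers.py create BUCKET 7 --trigger_id test-trigger" followed by "python triggers.py list" and "python triggers.py delete test-trigger", with --project defaulting to the GCLOUD_PROJECT environment variable.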