From b63f9327e57e5ca50bd01889af9df89df3d918ac Mon Sep 17 00:00:00 2001 From: Seth Moore Date: Thu, 4 Jun 2020 13:16:21 -0700 Subject: [PATCH 1/3] Add text redaction sample using DLP --- dlp/README.rst | 9 ++--- dlp/deid.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ dlp/deid_test.py | 14 ++++++++ 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/dlp/README.rst b/dlp/README.rst index 76bd9dd8dfc6..9ef0fc3fa147 100644 --- a/dlp/README.rst +++ b/dlp/README.rst @@ -339,13 +339,12 @@ To run this sample: .. code-block:: bash $ python deid.py - - usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift} ... + usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact} ... Uses of the Data Loss Prevention API for deidentifying sensitive data. positional arguments: - {deid_mask,deid_fpe,reid_fpe,deid_date_shift} + {deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact} Select how to submit content to the API. deid_mask Deidentify sensitive data in a string by masking it with a character. @@ -355,6 +354,8 @@ To run this sample: Preserving Encryption (FPE). deid_date_shift Deidentify dates in a CSV file by pseudorandomly shifting them. + redact Redact sensitive data in a string by replacing it with + the info type of the data. optional arguments: -h, --help show this help message and exit @@ -378,4 +379,4 @@ to `browse the source`_ and `report issues`_. https://github.com/GoogleCloudPlatform/google-cloud-python/issues -.. _Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file +.. 
_Google Cloud SDK: https://cloud.google.com/sdk/ diff --git a/dlp/deid.py b/dlp/deid.py index b08a341dd82e..27c2d0a36631 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -435,6 +435,62 @@ def write_data(data): # [END dlp_deidentify_date_shift] +# [START dlp_redact_sensitive_data] +def redact_sensitive_data(project, string, info_types): + """Uses the Data Loss Prevention API to redact sensitive data in a + string by replacing it with the info type. + Args: + project: The Google Cloud project id to use as a parent resource. + item: The string to redact (will be treated as text). + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_with_info_type_config": {} + } + } + ] + } + } + + # Construct item + item = {"value": string} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. 
+ print(response.item.value) + + +# [END dlp_redact_sensitive_data] + + if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) subparsers = parser.add_subparsers( @@ -626,6 +682,30 @@ def write_data(data): "key_name.", ) + redact_parser = subparsers.add_parser( + "redact", + help="Redact sensitive data in a string by replacing it with the " + "info type of the data.", + ) + redact_parser.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + redact_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + redact_parser.add_argument( + "item", + help="The string to redact. " + "Example: 'My credit card is 4242 4242 4242 4242'", + ) + args = parser.parse_args() if args.content == "deid_mask": @@ -667,3 +747,9 @@ def write_data(data): wrapped_key=args.wrapped_key, key_name=args.key_name, ) + elif args.content == "redact": + redact_sensitive_data( + args.project, + string=args.item, + info_types=args.info_types, + ) diff --git a/dlp/deid_test.py b/dlp/deid_test.py index db14b5758e96..db0c94e35dd6 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -185,3 +185,17 @@ def test_reidentify_with_fpe(capsys): out, _ = capsys.readouterr() assert "731997681" not in out + + +def test_redact_sensitive_data(capsys): + url_to_redact = "https://cloud.google.com" + deid.redact_sensitive_data( + GCLOUD_PROJECT, + "My favorite site is " + url_to_redact, + ["URL"], + ) + + out, _ = capsys.readouterr() + + assert url_to_redact not in out + assert "My favorite site is [URL]" in out From 9aa984fe9aa8acf43f54cf02b5c236c71e0e7543 Mon Sep 17 00:00:00 2001 From: Seth Moore Date: Fri, 5 Jun 2020 10:00:50 -0700 
Subject: [PATCH 2/3] Update dlp/deid.py Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com> --- dlp/deid.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dlp/deid.py b/dlp/deid.py index 27c2d0a36631..1e68b1f813ce 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -442,6 +442,8 @@ def redact_sensitive_data(project, string, info_types): Args: project: The Google Cloud project id to use as a parent resource. item: The string to redact (will be treated as text). + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. Returns: None; the response from the API is printed to the terminal. """ From 8151c31d3cd98ca7575a3f5ab899e5f7d3195801 Mon Sep 17 00:00:00 2001 From: Seth Moore Date: Fri, 5 Jun 2020 10:39:38 -0700 Subject: [PATCH 3/3] Rename string parameter to item --- dlp/deid.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 1e68b1f813ce..81847690866c 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -436,7 +436,7 @@ def write_data(data): # [START dlp_redact_sensitive_data] -def redact_sensitive_data(project, string, info_types): +def redact_sensitive_data(project, item, info_types): """Uses the Data Loss Prevention API to redact sensitive data in a string by replacing it with the info type. Args: @@ -475,15 +475,12 @@ def redact_sensitive_data(project, string, info_types): } } - # Construct item - item = {"value": string} - # Call the API response = dlp.deidentify_content( parent, inspect_config=inspect_config, deidentify_config=deidentify_config, - item=item, + item={"value": item}, ) # Print out the results. @@ -752,6 +749,6 @@ def redact_sensitive_data(project, string, info_types): elif args.content == "redact": redact_sensitive_data( args.project, - string=args.item, + item=args.item, info_types=args.info_types, )