diff --git a/dlp/custom_infotype.py b/dlp/custom_infotype.py
new file mode 100644
index 000000000000..3493476380fa
--- /dev/null
+++ b/dlp/custom_infotype.py
@@ -0,0 +1,85 @@
+# Copyright 2020 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Custom infoType snippets.
+
+This file contains sample code that uses the Data Loss Prevention API to create
+custom infoType detectors to refine scan results.
+"""
+
+
+# [START dlp_omit_name_if_also_email]
+def omit_name_if_also_email(
+    project,
+    content_string,
+):
+    """Matches PERSON_NAME and EMAIL_ADDRESS, but not both.
+
+    Uses the Data Loss Prevention API to omit matches on PERSON_NAME if the
+    EMAIL_ADDRESS detector also matches.
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        A list with the infoType name of each finding in the API response.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp_v2
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a list of infoTypes for DLP to locate in `content_string`. See
+    # https://cloud.google.com/dlp/docs/concepts-infotypes for more information
+    # about supported infoTypes.
+    info_types_to_locate = [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}]
+
+    # Construct the configuration dictionary that will only match on PERSON_NAME
+    # if the EMAIL_ADDRESS doesn't also match. This configuration helps reduce
+    # the total number of findings when there is a large overlap between different
+    # infoTypes.
+    inspect_config = {
+        "info_types": info_types_to_locate,
+        "rule_set": [{
+            "info_types": [{
+                "name": "PERSON_NAME"
+            }],
+            "rules": [{
+                "exclusion_rule": {
+                    "exclude_info_types": {
+                        "info_types": [{
+                            "name": "EMAIL_ADDRESS"
+                        }]
+                    },
+                    "matching_type": "MATCHING_TYPE_PARTIAL_MATCH"
+                }
+            }]
+        }]
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API and collect the infoType name of every finding.
+    response = dlp.inspect_content(parent, inspect_config, item)
+
+    return [f.info_type.name for f in response.result.findings]
+
+
+# [END dlp_omit_name_if_also_email]
diff --git a/dlp/custom_infotype_test.py b/dlp/custom_infotype_test.py
new file mode 100644
index 000000000000..521b09650b67
--- /dev/null
+++ b/dlp/custom_infotype_test.py
@@ -0,0 +1,28 @@
+# Copyright 2020 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import custom_infotype
+
+GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT")
+
+
+def test_omit_name_if_also_email():
+    """Inspect an email address and expect a single EMAIL_ADDRESS finding."""
+    info_types = custom_infotype.omit_name_if_also_email(
+        GCLOUD_PROJECT, "alice@example.com")
+
+    # Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME.
+    assert info_types == ["EMAIL_ADDRESS"]