Skip to content

Commit

Permalink
Add DLP code sample and test for de-id free text with surrogate (#4085)
Browse files Browse the repository at this point in the history
## Description
Add DLP code sample and test for de-id free text with surrogate, meant for https://cloud.google.com/dlp/docs/pseudonymization#de-identification_in_free_text_code_example

## Checklist
- [x] I have followed [Sample Guidelines from AUTHORING_GUIDE.MD](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md)
- [ ] README is updated to include [all relevant information](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md#readme-file)
- [x] **Tests** pass:   `nox -s py-3.6` (see [Test Environment Setup](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md#test-environment-setup))
- [x] **Lint** pass:   `nox -s lint` (see [Test Environment Setup](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md#test-environment-setup))
- [ ] These samples need a new **API enabled** in testing projects to pass (let us know which ones)
- [ ] These samples need a new/updated **env vars** in testing projects set to pass (let us know which ones)
- [x] Please **merge** this PR for me once it is approved.
  • Loading branch information
lxhfirenking authored Jun 15, 2020
1 parent 8acf0d6 commit 629c0a8
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 0 deletions.
87 changes: 87 additions & 0 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,93 @@ def reidentify_with_fpe(

# [END dlp_reidentify_fpe]


# [START dlp_deidentify_free_text_with_fpe_using_surrogate]
def deidentify_free_text_with_fpe_using_surrogate(
    project,
    input_str,
    alphabet="NUMERIC",
    info_type="PHONE_NUMBER",
    surrogate_type="PHONE_TOKEN",
    unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
):
    """De-identify one info type in free text with FPE and a surrogate.

    Sends ``input_str`` to the Data Loss Prevention API's
    ``deidentify_content`` method, asking it to apply a Format Preserving
    Encryption (FPE) transformation, keyed by an unwrapped (raw) key and
    tagged with a surrogate info type, to findings of ``info_type``. The
    text returned by the API is printed.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to de-identify (sent to the API as a text
            value).
        alphabet: The set of characters to replace sensitive ones with. For
            more information, see https://cloud.google.com/dlp/docs/reference/
            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
        info_type: The name of the info type to inspect for and
            de-identify.
        surrogate_type: The name of the surrogate custom info type to use.
            Can be essentially any arbitrary string, as long as it doesn't
            appear in your dataset otherwise.
        unwrapped_key: The base64-encoded encryption key. It is decoded to
            raw bytes before being sent; the default value decodes to 16
            bytes.

    Returns:
        None; the de-identified text from the API response is printed to
        the terminal.
    """
    # Imports live inside the function so the sample is self-contained.
    import base64

    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # The API wants a full resource path, not just the bare project id.
    parent = client.project_path(project)

    # Callers pass the key as base64 text, but the request carries raw bytes.
    key_bytes = base64.b64decode(unwrapped_key)

    # FPE settings: which key to use, which alphabet to use, and which
    # surrogate info type name to attach to the transformation.
    fpe_config = {
        "crypto_key": {"unwrapped": {"key": key_bytes}},
        "common_alphabet": alphabet,
        "surrogate_info_type": {"name": surrogate_type},
    }

    # Apply the FPE transformation only to findings of the requested type.
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": [{"name": info_type}],
                    "primitive_transformation": {
                        "crypto_replace_ffx_fpe_config": fpe_config
                    },
                }
            ]
        }
    }

    # Inspect only for the requested info type, with the minimum likelihood
    # threshold set to UNLIKELY.
    inspect_config = {
        "info_types": [{"name": info_type}],
        "min_likelihood": "UNLIKELY",
    }

    response = client.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": input_str},
    )

    # The de-identified text comes back in the response's content item.
    print(response.item.value)


# [END dlp_deidentify_free_text_with_fpe_using_surrogate]


# [START dlp_reidentify_free_text_with_fpe_using_surrogate]
def reidentify_free_text_with_fpe_using_surrogate(
project,
Expand Down
19 changes: 19 additions & 0 deletions dlp/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,25 @@ def test_reidentify_with_fpe(capsys):
assert "731997681" not in out


def test_deidentify_free_text_with_fpe_using_surrogate(capsys):
    """Check the sample's printed output for free text with a phone number.

    The output must contain the surrogate name, keep the surrounding text,
    and must not contain the original phone number.
    """
    plain_text = "My phone number is 4359916732"

    deid.deidentify_free_text_with_fpe_using_surrogate(
        GCLOUD_PROJECT,
        plain_text,
        alphabet="NUMERIC",
        info_type="PHONE_NUMBER",
        surrogate_type="PHONE_TOKEN",
        unwrapped_key=UNWRAPPED_KEY,
    )

    # The sample reports its result by printing, so inspect captured stdout.
    printed = capsys.readouterr().out

    assert "PHONE_TOKEN" in printed
    assert "My phone number is" in printed
    assert "4359916732" not in printed


def test_reidentify_free_text_with_fpe_using_surrogate(capsys):
labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398"

Expand Down

0 comments on commit 629c0a8

Please sign in to comment.