From 3102486ff151670944bfcf731abdf254bb0fc3c8 Mon Sep 17 00:00:00 2001 From: Takashi Matsuo Date: Wed, 3 Jun 2020 16:34:49 -0700 Subject: [PATCH] [dlp] testing: fix Pub/Sub notifications (#3925) * re-generated README.rst with some more setup info * use parent with the global location attached * re-enabled some tests with Pub/Sub notification * stop waiting between test retries --- dlp/README.rst | 34 ++++++++-------- dlp/README.rst.in | 8 +++- dlp/conftest.py | 20 ---------- dlp/inspect_content.py | 21 +++++----- dlp/inspect_content_test.py | 37 ++++++++++------- dlp/risk.py | 31 +++++++++------ dlp/risk_test.py | 42 ++++++-------------- scripts/readme-gen/templates/README.tmpl.rst | 7 ++++ 8 files changed, 98 insertions(+), 102 deletions(-) delete mode 100644 dlp/conftest.py diff --git a/dlp/README.rst b/dlp/README.rst index ce8b85500249..76bd9dd8dfc6 100644 --- a/dlp/README.rst +++ b/dlp/README.rst @@ -14,6 +14,15 @@ This directory contains samples for Google Data Loss Prevention. `Google Data Lo .. _Google Data Loss Prevention: https://cloud.google.com/dlp/docs/ +To run the sample, you need to enable the API at: https://console.cloud.google.com/apis/library/dlp.googleapis.com + + +To run the sample, you need to have the following roles: +* `DLP Administrator` +* `DLP API Service Agent` + + + Setup ------------------------------------------------------------------------------- @@ -58,15 +67,6 @@ Install Dependencies .. _pip: https://pip.pypa.io/ .. _virtualenv: https://virtualenv.pypa.io/ -#. For running *_test.py files, install test dependencies - - .. code-block:: bash - - $ pip install -r requirements-test.txt - $ pytest inspect_content_test.py - -** *_test.py files are demo wrappers and make API calls. You may get rate limited for making high number of requests. ** - Samples ------------------------------------------------------------------------------- @@ -83,7 +83,7 @@ To run this sample: .. code-block:: bash - $ python quickstart.py + $ python quickstart.py Inspect Content @@ -101,15 +101,16 @@ To run this sample: $ python inspect_content.py - usage: inspect_content.py [-h] {string,file,gcs,datastore,bigquery} ... + usage: inspect_content.py [-h] {string,table,file,gcs,datastore,bigquery} ... Sample app that uses the Data Loss Prevention API to inspect a string, a local file or a file on Google Cloud Storage. positional arguments: - {string,file,gcs,datastore,bigquery} + {string,table,file,gcs,datastore,bigquery} Select how to submit content to the API. string Inspect a string. + table Inspect a table. file Inspect a local file. gcs Inspect files on Google Cloud Storage. datastore Inspect files on Google Datastore. @@ -135,13 +136,14 @@ To run this sample: $ python redact.py - usage: redact.py [-h] [--project PROJECT] [--info_types INFO_TYPES] + usage: redact.py [-h] [--project PROJECT] + [--info_types INFO_TYPES [INFO_TYPES ...]] [--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}] [--mime_type MIME_TYPE] filename output_filename - Sample app that uses the Data Loss Prevent API to redact the contents of a - string or an image file. + Sample app that uses the Data Loss Prevent API to redact the contents of an + image file. positional arguments: filename The path to the file to inspect. @@ -151,7 +153,7 @@ To run this sample: -h, --help show this help message and exit --project PROJECT The Google Cloud project id to use as a parent resource. - --info_types INFO_TYPES + --info_types INFO_TYPES [INFO_TYPES ...] 
Strings representing info types to look for. A full list of info categories and types is available from the API. Examples include "FIRST_NAME", "LAST_NAME", diff --git a/dlp/README.rst.in b/dlp/README.rst.in index 8a143392b17e..708e870fa08a 100644 --- a/dlp/README.rst.in +++ b/dlp/README.rst.in @@ -4,7 +4,7 @@ product: name: Google Data Loss Prevention short_name: Data Loss Prevention url: https://cloud.google.com/dlp/docs/ - description: > + description: > `Google Data Loss Prevention`_ provides programmatic access to a powerful detection engine for personally identifiable information and other privacy-sensitive data in unstructured data streams. @@ -13,6 +13,12 @@ setup: - auth - install_deps +required_api_url: https://console.cloud.google.com/apis/library/dlp.googleapis.com + +required_roles: +- DLP Administrator +- DLP API Service Agent + samples: - name: Quickstart file: quickstart.py diff --git a/dlp/conftest.py b/dlp/conftest.py deleted file mode 100644 index 362e5a2c2711..000000000000 --- a/dlp/conftest.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - - -# Used in risk_test.py to limit the maximum wait time before the flaky retries. -def pytest_configure(config): - pytest.MAX_FLAKY_WAIT = 3600 # maximum of an hour diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 6d6baad4827a..1c5f9c4df958 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -459,11 +459,12 @@ def inspect_gcs_file( url = "gs://{}/{}".format(bucket, filename) storage_config = {"cloud_storage_options": {"file_set": {"url": url}}} - # Convert the project id into a full resource id. - parent = dlp.project_path(project) + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { @@ -623,11 +624,12 @@ def inspect_datastore( } } - # Convert the project id into a full resource id. - parent = dlp.project_path(project) + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { @@ -790,11 +792,12 @@ def inspect_bigquery( } } - # Convert the project id into a full resource id. - parent = dlp.project_path(project) + # Convert the project id into full resource ids. 
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Construct the inspect_job, which defines the entire inspect content task. inspect_job = { diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index ea100d16d84a..e2192bdd6c41 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -40,6 +40,8 @@ BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING +TIMEOUT = 300 # 5 minutes + @pytest.fixture(scope="module") def bucket(): @@ -298,6 +300,7 @@ def cancel_operation(out): client.cancel_dlp_job(operation_id) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): try: inspect_content.inspect_gcs_file( @@ -307,15 +310,16 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): topic_id, subscription_id, ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=1 + timeout=TIMEOUT ) out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "Info type: EMAIL_ADDRESS" in out finally: cancel_operation(out) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_gcs_file_with_custom_info_types( bucket, topic_id, subscription_id, capsys): try: @@ -331,15 +335,16 @@ def test_inspect_gcs_file_with_custom_info_types( [], custom_dictionaries=dictionaries, custom_regexes=regexes, - timeout=1) + timeout=TIMEOUT) out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "Info type: EMAIL_ADDRESS" in out finally: cancel_operation(out) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_gcs_file_no_results( bucket, topic_id, subscription_id, capsys): try: @@ -350,15 +355,16 @@ def test_inspect_gcs_file_no_results( topic_id, subscription_id, ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=1) + timeout=TIMEOUT) out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "No findings" in out finally: cancel_operation(out) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_gcs_image_file(bucket, topic_id, subscription_id, capsys): try: inspect_content.inspect_gcs_file( @@ -368,14 +374,15 @@ def test_inspect_gcs_image_file(bucket, topic_id, subscription_id, capsys): topic_id, subscription_id, ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=1) + timeout=TIMEOUT) out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "Info type: EMAIL_ADDRESS" in out finally: cancel_operation(out) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys): try: inspect_content.inspect_gcs_file( @@ -385,15 +392,16 @@ def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys): topic_id, subscription_id, ["EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=1) + timeout=TIMEOUT) out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "Info type: EMAIL_ADDRESS" in out finally: cancel_operation(out) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_datastore( datastore_project, topic_id, subscription_id, capsys): try: @@ -404,14 +412,15 @@ def test_inspect_datastore( topic_id, subscription_id, ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], - timeout=1) + timeout=TIMEOUT) 
out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "Info type: EMAIL_ADDRESS" in out finally: cancel_operation(out) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_inspect_datastore_no_results( datastore_project, topic_id, subscription_id, capsys): try: @@ -422,10 +431,10 @@ def test_inspect_datastore_no_results( topic_id, subscription_id, ["PHONE_NUMBER"], - timeout=1) + timeout=TIMEOUT) out, _ = capsys.readouterr() - assert "Inspection operation started" in out + assert "No findings" in out finally: cancel_operation(out) diff --git a/dlp/risk.py b/dlp/risk.py index a31dfb12c6ef..518f947eee6b 100644 --- a/dlp/risk.py +++ b/dlp/risk.py @@ -59,8 +59,9 @@ def numerical_risk_analysis( # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() - # Convert the project id into a full resource id. - parent = dlp.project_path(project) + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Location info of the BigQuery table. source_table = { @@ -70,7 +71,7 @@ def numerical_risk_analysis( } # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for @@ -169,8 +170,9 @@ def categorical_risk_analysis( # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() - # Convert the project id into a full resource id. - parent = dlp.project_path(project) + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Location info of the BigQuery table. source_table = { @@ -180,7 +182,7 @@ def categorical_risk_analysis( } # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for @@ -294,7 +296,8 @@ def get_values(obj): dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. - parent = dlp.project_path(project) + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Location info of the BigQuery table. source_table = { @@ -310,7 +313,7 @@ def map_fields(field): quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for @@ -425,7 +428,8 @@ def get_values(obj): dlp = google.cloud.dlp_v2.DlpServiceClient() # Convert the project id into a full resource id. - parent = dlp.project_path(project) + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Location info of the BigQuery table. source_table = { @@ -441,7 +445,7 @@ def map_fields(field): quasi_ids = map(map_fields, quasi_ids) # Tell the API where to send a notification when the job is complete. 
- actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for @@ -574,8 +578,9 @@ def get_values(obj): # Instantiate a client. dlp = google.cloud.dlp_v2.DlpServiceClient() - # Convert the project id into a full resource id. - parent = dlp.project_path(project) + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') # Location info of the BigQuery table. source_table = { @@ -598,7 +603,7 @@ def map_fields(quasi_id, info_type): quasi_ids = map(map_fields, quasi_ids, info_types) # Tell the API where to send a notification when the job is complete. - actions = [{"pub_sub": {"topic": "{}/topics/{}".format(parent, topic_id)}}] + actions = [{"pub_sub": {"topic": topic}}] # Configure risk analysis job # Give the name of the numeric column to compute risk metrics for diff --git a/dlp/risk_test.py b/dlp/risk_test.py index 0164cf3b8c0e..36f7f54a0951 100644 --- a/dlp/risk_test.py +++ b/dlp/risk_test.py @@ -13,7 +13,6 @@ # limitations under the License. import os -import time import uuid import google.cloud.bigquery @@ -37,14 +36,14 @@ BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING -TIMEOUT = 30 +TIMEOUT = 60 # 1 minutes # Create new custom topic/subscription # We observe sometimes all the tests in this file fail. In a # hypothesis where DLP service somehow loses the connection to the # topic, now we use function scope for Pub/Sub fixtures. -@pytest.fixture(scope="function") +@pytest.fixture(scope="module") def topic_id(): # Creates a pubsub topic, and tears it down. publisher = google.cloud.pubsub.PublisherClient() @@ -59,7 +58,7 @@ def topic_id(): publisher.delete_topic(topic_path) -@pytest.fixture(scope="function") +@pytest.fixture(scope="module") def subscription_id(topic_id): # Subscribes to a topic. subscriber = google.cloud.pubsub.SubscriberClient() @@ -166,22 +165,7 @@ def bigquery_project(): bigquery_client.delete_dataset(dataset_ref, delete_contents=True) -def delay(err, *args): - # 20 mins of delay. This sounds like too long a delay, but we - # occasionally observe consequtive time block where operations are - # slow which leads to the test failures. These situations tend to - # get self healed in 20 minutes or so, so I'm trying this strategy. - # - # There are 10 tests, so we don't want the retry delay happening - # for all the tests. When we exhaust the MAX_FLAKY_WAIT, we retry - # the test immediately. 
- wait_time = min(pytest.MAX_FLAKY_WAIT, 60*20) - pytest.MAX_FLAKY_WAIT -= wait_time - time.sleep(wait_time) - return True - - -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_numerical_risk_analysis( topic_id, subscription_id, bigquery_project, capsys ): @@ -200,7 +184,7 @@ def test_numerical_risk_analysis( assert "Value Range:" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_categorical_risk_analysis_on_string_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -219,7 +203,7 @@ def test_categorical_risk_analysis_on_string_field( assert "Most common value occurs" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_categorical_risk_analysis_on_number_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -238,7 +222,7 @@ def test_categorical_risk_analysis_on_number_field( assert "Most common value occurs" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_k_anonymity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -258,7 +242,7 @@ def test_k_anonymity_analysis_single_field( assert "Class size:" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_k_anonymity_analysis_multiple_fields( topic_id, subscription_id, bigquery_project, capsys ): @@ -278,7 +262,7 @@ def test_k_anonymity_analysis_multiple_fields( assert "Class size:" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_l_diversity_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -300,7 +284,7 @@ def test_l_diversity_analysis_single_field( assert "Sensitive value" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_l_diversity_analysis_multiple_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -322,7 +306,7 @@ def test_l_diversity_analysis_multiple_field( assert "Sensitive value" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_k_map_estimate_analysis_single_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -344,7 +328,7 @@ def test_k_map_estimate_analysis_single_field( assert "Values" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_k_map_estimate_analysis_multiple_field( topic_id, subscription_id, bigquery_project, capsys ): @@ -366,7 +350,7 @@ def test_k_map_estimate_analysis_multiple_field( assert "Values" in out -@pytest.mark.flaky(max_runs=2, min_passes=1, rerun_filter=delay) +@pytest.mark.flaky(max_runs=2, min_passes=1) def test_k_map_estimate_analysis_quasi_ids_info_types_equal( topic_id, subscription_id, bigquery_project ): diff --git a/scripts/readme-gen/templates/README.tmpl.rst b/scripts/readme-gen/templates/README.tmpl.rst index 30ad03d050d8..1d0432d0d927 100644 --- a/scripts/readme-gen/templates/README.tmpl.rst +++ b/scripts/readme-gen/templates/README.tmpl.rst @@ -23,6 +23,13 @@ To run the sample, you need to enable the API at: {{required_api_url}} To run the sample, you need to have `{{required_role}}` role. 
{% endif %} +{% if required_roles %} +To run the sample, you need to have the following roles: +{% for role in required_roles %} +* `{{role}}` +{% endfor %} +{% endif %} + {{other_required_steps}} {% if setup %}
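For reference, the resource-path pattern this patch applies throughout ``inspect_content.py`` and ``risk.py`` can be reduced to a short standalone sketch. This is an illustrative sketch only, not code taken from the samples: the ``project``, ``topic_id``, and bucket values are placeholders, and it assumes the client-library versions in use at this commit (google-cloud-dlp 1.x and google-cloud-pubsub 1.x; later releases changed both surfaces). The change it mirrors is that the Pub/Sub topic is built with ``PublisherClient.topic_path`` rather than by string formatting on the DLP parent, and the parent itself now carries the ``global`` location.

.. code-block:: python

    import google.cloud.dlp_v2
    import google.cloud.pubsub

    # Placeholder identifiers -- substitute real resources before running.
    project = "my-project"
    topic_id = "my-topic"
    bucket = "my-bucket"

    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Build full resource ids the way the updated samples do: the topic via
    # the Pub/Sub helper, the DLP parent with the global location attached.
    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
    parent = dlp.location_path(project, "global")

    # Tell the API to publish to the topic when the job completes.
    actions = [{"pub_sub": {"topic": topic}}]

    # Abbreviated job config; the real samples build this from CLI arguments.
    inspect_job = {
        "inspect_config": {"info_types": [{"name": "EMAIL_ADDRESS"}]},
        "storage_config": {
            "cloud_storage_options": {
                "file_set": {"url": "gs://{}/*".format(bucket)}
            }
        },
        "actions": actions,
    }

    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)
    print("Inspection operation started: {}".format(operation.name))

With the topic expressed directly as ``projects/<project>/topics/<topic>`` instead of being derived from the parent string, the notification still reaches the test subscriber even though the parent now resolves to ``projects/<project>/locations/global``. That is what allows the tests to wait on real findings (``Info type: ...`` / ``No findings``) with a 300-second timeout rather than only asserting that the operation started.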