diff --git a/README.md b/README.md index d05d1447..979a4aa1 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ [![Documentation Status](https://readthedocs.org/projects/gretel-client/badge/?version=latest)](https://gretel-client.readthedocs.io/en/stable/?badge=stable) +Check out our [documentation](https://gretel-client.readthedocs.io/en/stable/?badge=stable) for getting started guides and module references. + +For more advanced usage, please refer to our [blueprints](blueprints). + The Gretel Python Client provides bindings to the Gretel REST API and a transformation sub-package that provides interfaces to manipulate data based on a variety of use cases. The REST API bindings and transformer interfaces can be used separately or together to solve a variety of data analysis, anonymization, and other ETL use cases. diff --git a/notebooks/launch_transformers.ipynb b/notebooks/launch_transformers.ipynb index 276f0050..6062218e 100644 --- a/notebooks/launch_transformers.ipynb +++ b/notebooks/launch_transformers.ipynb @@ -8,11 +8,13 @@ "\n", "Welcome to the Gretel Transformers walkthrough! In this tutorial we will take you through the process of creating a data pipeline to apply a variety of transformations to your data.\n", "\n", - "This tutorial assumes you have already uploaded data to Gretel.\n", + "This tutorial assumes you have already uploaded data to a [Gretel Project](https://console.gretel.cloud).\n", "\n", "The transformers in this example work on entity labels only. We have chosen a subset of labels we see often in data.\n", "\n", - "If you would like to build field-level transforms, please look through our blueprints directory (in the top level of the repository) for examples." 
+ "If you would like to build field-level transforms or see more advanced use cases, please look through our [blueprints directory](https://github.com/gretelai/gretel-python-client/tree/master/blueprints) for more examples.\n", + "\n", + "For a more exhaustive list of possible transformations, please reference our [documentation](https://gretel-client.readthedocs.io/en/latest/transformers/api_ref.html#module-reference-transformers)." ] }, { @@ -25,6 +27,8 @@ }, "outputs": [], "source": [ + "# NOTE: Run this cell and copy your Gretel URI into the text box below\n", + "\n", "import getpass\n", "import os\n", "\n", @@ -37,7 +41,8 @@ "source": [ "## Create a Gretel Project Instance\n", "\n", - "In the code below, we will utilize the gretel-client to create an instance of a project that will be used to syntesize data from. " + "In the code below, we will utilize the gretel-client to create an instance of a Project that we can use to iterate\n", + "over labeled records." ] }, { @@ -49,7 +54,7 @@ "outputs": [], "source": [ "%%capture\n", - "!pip install \"gretel-client==0.7.0.rc7\" --upgrade" + "!pip install gretel-client --upgrade" ] }, { @@ -73,14 +78,14 @@ "metadata": {}, "outputs": [], "source": [ - "# We can see how many records we've ingested and how many fields we've discovered, just to show the\n", - "# project is active.\n", - "print(f'Total Records Received: {project.record_count}')\n", - "print(f'Total Fields Discovered: {project.field_count}')\n", - "\n", - "print(\"\")\n", - "print('Previewing project dataframe')\n", - "project.head(5)" + "# Example JSON record and Gretel Metadata from the Project stream\n", + "\n", + "# Components of a record:\n", + "# - id: A unique ID that represents the position in the stream where the record resides\n", + "# - data: A flattened version of the raw record that was received\n", + "# - metadata: A dictionary of metadata, keyed by field name\n", + "\n", + "project.sample()[0]" ] }, { @@ -117,6 +122,9 @@ "email_mask = 
StringMask(start_pos=3)\n", "email_transformer = [RedactWithCharConfig(labels=[\"email_address\"], minimum_score=Score.MED, mask=[email_mask])]\n", "\n", + "ip_mask = StringMask(start_pos=-6)\n", + "ip_transformer = [RedactWithCharConfig(labels=[\"ip_address\"], minimum_score=Score.MED, mask=[ip_mask])]\n", + "\n", "# let's mask the last 2 digits of zip codes\n", "zip_mask = StringMask(start_pos=-2)\n", "zip_transformer = [RedactWithCharConfig(labels=[\"us_zip_code\"], minimum_score=Score.MED, mask=[zip_mask])]\n", @@ -129,6 +137,9 @@ "# let's replace phone numbers with totally fake, but consistent ones\n", "phone_transformer = [FakeConstantConfig(labels=[\"phone_number\"], minimum_score=Score.MED, seed=1234, fake_method=\"phone_number\")]\n", "\n", + "# let's replace person names with totally fake, but consistent ones\n", + "person_transformer = [FakeConstantConfig(labels=[\"person_name\"], minimum_score=Score.MED, seed=1234, fake_method=\"person_name\")]\n", + "\n", "# aggressively mask all locations\n", "location_transformer = [RedactWithLabelConfig(labels=[\"location\"], minimum_score=Score.MED)]\n", "\n", @@ -140,7 +151,7 @@ "# since we are only working on automatic transforms based on labels\n", "# they can all go into one datapath\n", "\n", - "all_transformers = email_transformer + zip_transformer + token_transformer + phone_transformer + location_transformer + lat_lon_transformer\n", + "all_transformers = email_transformer + ip_transformer + zip_transformer + token_transformer + phone_transformer + person_transformer + location_transformer + lat_lon_transformer\n", "data_path = [\n", " DataPath(input=\"*\", xforms=all_transformers)\n", "]\n", @@ -194,7 +205,7 @@ "# Print out Git-style diffs between source and transformed records\n", "for original, transformed in zip(records, transformed_records):\n", " show_record_diff(original[\"data\"], transformed[\"data\"])\n", - " input()" + " input(\"Press enter / return to go to the next record\")" ] }, { @@ -202,7 
+213,17 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# If you have data being continuously ingested by the Gretel API, you can consume the labeled\n", + "# data and automatically apply your transforms like so:\n", + "#\n", + "# NOTE: If no data is currently being ingested, this operation will block until records are received\n", + "#\n", + "for record in project.iter_records():\n", + " # from here you may route your transformed records anywhere!\n", + " transformed = pipeline.transform_record(record)\n", + " print(transformed[\"record\"])" + ] } ], "metadata": { @@ -226,4 +247,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/setup.py b/setup.py index cab7bbee..11501d0f 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'dataclasses;python_version<"3.7"' ], extras_require={ - "pandas": ["pandas==1.0.3"], + "pandas": ["pandas>1.0.0,<1.1.0"], "fpe": ["numpy", "pycryptodome==3.9.8", "dateparser==0.7.6"] }, ) diff --git a/transformers.md b/transformers.md index 85b0738e..0ad3a0d7 100644 --- a/transformers.md +++ b/transformers.md @@ -3,7 +3,7 @@ Welcome to the Gretel Transformers documentation! Here we will introduce you to the concepts in the Transformers sub-package and provide some basic tutorials for getting started. -For more advanced usage, please refer to our tutorials / guides on [our blog](https://www.medium.com/gretel-ai). +For more advanced usage, please refer to our [blueprints](https://github.com/gretelai/gretel-python-client/tree/master/blueprints). ## Installation