Skip to content

Commit

Permalink
community[minor]: [Pebblo] Enhance PebbloSafeLoader to take anonymize…
Browse files Browse the repository at this point in the history
… flag (langchain-ai#26812)

- **Description:** Adds an `anonymize_snippets` flag to `PebbloSafeLoader`. When set to
`True`, the Pebblo server anonymizes snippets by redacting all
personally identifiable information (PII) from the snippets going into
VectorDB and the generated reports.
- **Issue:** NA
- **Dependencies:** NA
- **docs**: Updated
  • Loading branch information
Raj725 authored and Sheepsta300 committed Oct 1, 2024
1 parent 9288578 commit d301339
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 0 deletions.
33 changes: 33 additions & 0 deletions docs/docs/integrations/document_loaders/pebblo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,39 @@
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Anonymize the snippets to redact all PII details\n",
"\n",
"Set `anonymize_snippets` to `True` to redact all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports.\n",
"\n",
"> Note: The _Pebblo Entity Classifier_ effectively identifies personally identifiable information (PII) and is continuously evolving. While its recall is not yet 100%, it is steadily improving.\n",
"> For more details, please refer to the [_Pebblo Entity Classifier docs_](https://daxa-ai.github.io/pebblo/entityclassifier/)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import CSVLoader, PebbloSafeLoader\n",
"\n",
"loader = PebbloSafeLoader(\n",
" CSVLoader(\"data/corp_sens_data.csv\"),\n",
" name=\"acme-corp-rag-1\", # App name (Mandatory)\n",
" owner=\"Joe Smith\", # Owner (Optional)\n",
" description=\"Support productivity RAG application\", # Description (Optional)\n",
" anonymize_snippets=True, # Whether to anonymize snippets going into VectorDB and the generated reports (Optional, default=False)\n",
")\n",
"documents = loader.load()\n",
"print(documents[0].metadata)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
Expand Down
2 changes: 2 additions & 0 deletions libs/community/langchain_community/document_loaders/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
classifier_url: Optional[str] = None,
*,
classifier_location: str = "local",
anonymize_snippets: bool = False,
):
if not name or not isinstance(name, str):
raise NameError("Must specify a valid name.")
Expand Down Expand Up @@ -78,6 +79,7 @@ def __init__(
api_key=api_key,
classifier_location=classifier_location,
classifier_url=classifier_url,
anonymize_snippets=anonymize_snippets,
)
self.pb_client.send_loader_discover(self.app)

Expand Down
7 changes: 7 additions & 0 deletions libs/community/langchain_community/utilities/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ class Doc(BaseModel):
"""Owner of the source of the loader."""
classifier_location: str
"""Location of the classifier."""
anonymize_snippets: bool
"""Whether to anonymize snippets going into VectorDB and the generated reports"""


def get_full_path(path: str) -> str:
Expand Down Expand Up @@ -424,6 +426,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
"""URL of the Pebblo Classifier"""
cloud_url: Optional[str]
"""URL of the Pebblo Cloud"""
anonymize_snippets: bool = False
"""Whether to anonymize snippets going into VectorDB and the generated reports"""

def __init__(self, **kwargs: Any):
"""Validate that api key in environment."""
Expand Down Expand Up @@ -522,6 +526,8 @@ def classify_documents(
# If local classifier is used add the classified information
# and remove doc content
self.update_doc_data(payload["docs"], classified_docs)
# Remove the anonymize_snippets key from payload
payload.pop("anonymize_snippets", None)
self.send_docs_to_pebblo_cloud(payload)
elif self.classifier_location == "pebblo-cloud":
logger.warning("API key is missing for sending docs to Pebblo cloud.")
Expand Down Expand Up @@ -599,6 +605,7 @@ def build_classification_payload(
"loading_end": "false",
"source_owner": source_owner,
"classifier_location": self.classifier_location,
"anonymize_snippets": self.anonymize_snippets,
}
if loading_end is True:
payload["loading_end"] = "true"
Expand Down

0 comments on commit d301339

Please sign in to comment.