Skip to content

Commit

Permalink
smtp authentication
Browse files Browse the repository at this point in the history
  • Loading branch information
Yevheniia Nikonchuk authored and Yevheniia Nikonchuk committed Dec 19, 2024
1 parent 5c7be0a commit 52cc1d3
Show file tree
Hide file tree
Showing 15 changed files with 532 additions and 61 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ se_user_conf = {
#Below two params are optional and need to be enabled to pass the custom email body
#user_config.se_notifications_enable_custom_email_body: True,
#user_config.se_notifications_email_custom_body: "Custom statistics: 'product_id': {}",

#The parameter below is optional and needs to be enabled in case authorization is required to access the SMTP server.
#user_config.se_notifications_email_smtp_auth: True,
}
```

Expand Down
16 changes: 15 additions & 1 deletion docs/bigquery.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,19 @@ writer = (
.option("writeMethod", "direct")
)

#If the SMTP server needs to be authenticated, the password should be set in a secure way, e.g. via Cerberus or a Databricks secret
stats_streaming_config_dict = {
user_config.se_enable_streaming: False,
user_config.secret_type: "cerberus",
user_config.cbs_url: "https://cerberus.example.com",
user_config.cbs_sdb_path: "",
user_config.cbs_smtp_password: "",
# user_config.secret_type: "databricks",
# user_config.dbx_workspace_url: "https://workspace.cloud.databricks.com",
# user_config.dbx_secret_scope: "your_secret_scope",
# user_config.dbx_smtp_password: "your_password",
}

se: SparkExpectations = SparkExpectations(
product_id="your_product",
rules_df=spark.read.format("bigquery").load(
Expand All @@ -52,13 +65,14 @@ se: SparkExpectations = SparkExpectations(
stats_table_writer=writer,
target_and_error_table_writer=writer,
debugger=False,
stats_streaming_options={user_config.se_enable_streaming: False}
stats_streaming_options=stats_streaming_config_dict,
)


# Commented fields are optional or required when notifications are enabled
user_conf = {
user_config.se_notifications_enable_email: False,
# user_config.se_notifications_enable_smtp_server_auth: False,
# user_config.se_notifications_enable_custom_email_body: True,
# user_config.se_notifications_email_smtp_host: "mailhost.com",
# user_config.se_notifications_email_smtp_port: 25,
Expand Down
16 changes: 15 additions & 1 deletion docs/delta.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ from spark_expectations.core.expectations import (
)
from spark_expectations.config.user_config import Constants as user_config

# If the SMTP server needs to be authenticated, the password should be set in a secure way, e.g. via Cerberus or a Databricks secret
stats_streaming_config_dict = {
user_config.se_enable_streaming: False,
user_config.secret_type: "cerberus",
user_config.cbs_url: "https://cerberus.example.com",
user_config.cbs_sdb_path: "",
user_config.cbs_smtp_password: "",
# user_config.secret_type: "databricks",
# user_config.dbx_workspace_url: "https://workspace.cloud.databricks.com",
# user_config.dbx_secret_scope: "your_secret_scope",
# user_config.dbx_smtp_password: "your_password",
}

writer = WrappedDataFrameWriter().mode("append").format("delta")

se: SparkExpectations = SparkExpectations(
Expand All @@ -43,12 +56,13 @@ se: SparkExpectations = SparkExpectations(
stats_table_writer=writer,
target_and_error_table_writer=writer,
debugger=False,
stats_streaming_options={user_config.se_enable_streaming: False}
stats_streaming_options=stats_streaming_config_dict
)

# Commented fields are optional or required when notifications are enabled
user_conf = {
user_config.se_notifications_enable_email: False,
# user_config.se_notifications_enable_smtp_server_auth: False,
# user_config.se_notifications_enable_custom_email_body: True,
# user_config.se_notifications_email_smtp_host: "mailhost.com",
# user_config.se_notifications_email_smtp_port: 25,
Expand Down
88 changes: 49 additions & 39 deletions docs/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,52 +8,54 @@ from spark_expectations.config.user_config import Constants as user_config

se_user_conf = {
user_config.se_notifications_enable_email: False, # (1)!
user_config.se_notifications_enable_custom_email_body: False, # (2)
user_config.se_notifications_email_smtp_host: "mailhost.com", # (3)!
user_config.se_notifications_email_smtp_port: 25, # (4)!
user_config.se_notifications_email_from: "<sender_email_id>", # (5)!
user_config.se_notifications_email_to_other_mail_id: "<receiver_email_id's>", # (6)!
user_config.se_notifications_email_subject: "spark expectations - data quality - notifications", # (7)!
user_config.se_notifications_email_custom_body: "custom stats: 'product_id': {}", # (8)!
user_config.se_notifications_enable_slack: True, # (9)!
user_config.se_notifications_slack_webhook_url: "<slack-webhook-url>", # (10)!
user_config.se_notifications_on_start: True, # (11)!
user_config.se_notifications_on_completion: True, # (12)!
user_config.se_notifications_on_fail: True, # (13)!
user_config.se_notifications_on_error_drop_exceeds_threshold_breach: True, # (14)!
user_config.se_notifications_on_error_drop_threshold: 15, # (15)!
user_config.se_enable_error_table: True, # (16)!
user_config.enable_query_dq_detailed_result: True, # (17)!
user_config.enable_agg_dq_detailed_result: True, # (18)!
user_config.querydq_output_custom_table_name: "<catalog.schema.table-name>", #19
user_config.se_notifications_enable_smtp_server_auth: False, # (2)!
user_config.se_notifications_enable_custom_email_body: False, # (3)
user_config.se_notifications_email_smtp_host: "mailhost.com", # (4)!
user_config.se_notifications_email_smtp_port: 25, # (5)!
user_config.se_notifications_email_from: "<sender_email_id>", # (6)!
user_config.se_notifications_email_to_other_mail_id: "<receiver_email_id's>", # (7)!
user_config.se_notifications_email_subject: "spark expectations - data quality - notifications", # (8)!
user_config.se_notifications_email_custom_body: "custom stats: 'product_id': {}", # (9)!
user_config.se_notifications_enable_slack: True, # (10)!
user_config.se_notifications_slack_webhook_url: "<slack-webhook-url>", # (11)!
user_config.se_notifications_on_start: True, # (12)!
user_config.se_notifications_on_completion: True, # (13)!
user_config.se_notifications_on_fail: True, # (14)!
user_config.se_notifications_on_error_drop_exceeds_threshold_breach: True, # (15)!
user_config.se_notifications_on_error_drop_threshold: 15, # (16)!
user_config.se_enable_error_table: True, # (17)!
user_config.enable_query_dq_detailed_result: True, # (18)!
user_config.enable_agg_dq_detailed_result: True, # (19)!
user_config.querydq_output_custom_table_name: "<catalog.schema.table-name>", #20
user_config.se_dq_rules_params: {
"env": "local",
"table": "product",
}, # (20)!
}, # (21)!
}
}
```

1. The `user_config.se_notifications_enable_email` parameter, which controls whether notifications are sent via email, is set to false by default
2. The `user_config.se_notifications_enable_custom_email_body` optional parameter, which controls whether custom email body is enabled, is set to false by default
3. The `user_config.se_notifications_email_smtp_host` parameter is set to "mailhost.com" by default and is used to specify the email SMTP domain host
4. The `user_config.se_notifications_email_smtp_port` parameter, which accepts a port number, is set to "25" by default
5. The `user_config.se_notifications_email_from` parameter is used to specify the email ID that will trigger the email notification
6. The `user_config.se_notifications_email_to_other_mail_id` parameter accepts a list of recipient email IDs
7. The `user_config.se_notifications_email_subject` parameter captures the subject line of the email
8. The `user_config.se_notifications_email_custom_body` optional parameter, captures the custom email body, need to be compliant with certain syntax
9. The `user_config.se_notifications_enable_slack` parameter, which controls whether notifications are sent via slack, is set to false by default
10. The `user_config.se_notifications_slack_webhook_url` parameter accepts the webhook URL of a Slack channel for sending notifications
11. When `user_config.se_notifications_on_start` parameter set to `True` enables notification on start of the spark-expectations, variable by default set to `False`
12. When `user_config.se_notifications_on_completion` parameter set to `True` enables notification on completion of spark-expectations framework, variable by default set to `False`
13. When `user_config.se_notifications_on_fail` parameter set to `True` enables notification on failure of spark-expectations data quality framework, variable by default set to `True`
14. When `user_config.se_notifications_on_error_drop_exceeds_threshold_breach` parameter set to `True` enables notification when error threshold reaches above the configured value
15. The `user_config.se_notifications_on_error_drop_threshold` parameter captures error drop threshold value
16. The `user_config.se_enable_error_table` parameter, which controls whether error data to load into error table, is set to true by default
17. When `user_config.enable_query_dq_detailed_result` parameter set to `True`, enables the option to capture the query_dq detailed stats to detailed_stats table. By default set to `False`
18. When `user_config.enable_agg_dq_detailed_result` parameter set to `True`, enables the option to capture the agg_dq detailed stats to detailed_stats table. By default set to `False`
19. The `user_config.querydq_output_custom_table_name` parameter is used to specify the name of the custom query_dq output table which captures the output of the alias queries passed in the query dq expectation. Default is <stats_table>_custom_output
20. The `user_config.se_dq_rules_params` parameter, which are required to dynamically update dq rules
2. The `user_config.se_notifications_enable_smtp_server_auth` optional parameter, which controls whether SMTP server authentication is enabled, is set to false by default
3. The `user_config.se_notifications_enable_custom_email_body` optional parameter, which controls whether custom email body is enabled, is set to false by default
4. The `user_config.se_notifications_email_smtp_host` parameter is set to "mailhost.com" by default and is used to specify the email SMTP domain host
5. The `user_config.se_notifications_email_smtp_port` parameter, which accepts a port number, is set to "25" by default
6. The `user_config.se_notifications_email_from` parameter is used to specify the email ID that will trigger the email notification
7. The `user_config.se_notifications_email_to_other_mail_id` parameter accepts a list of recipient email IDs
8. The `user_config.se_notifications_email_subject` parameter captures the subject line of the email
9. The `user_config.se_notifications_email_custom_body` optional parameter, captures the custom email body, need to be compliant with certain syntax
10. The `user_config.se_notifications_enable_slack` parameter, which controls whether notifications are sent via slack, is set to false by default
11. The `user_config.se_notifications_slack_webhook_url` parameter accepts the webhook URL of a Slack channel for sending notifications
12. When `user_config.se_notifications_on_start` parameter set to `True` enables notification on start of the spark-expectations, variable by default set to `False`
13. When `user_config.se_notifications_on_completion` parameter set to `True` enables notification on completion of spark-expectations framework, variable by default set to `False`
14. When `user_config.se_notifications_on_fail` parameter set to `True` enables notification on failure of spark-expectations data quality framework, variable by default set to `True`
15. When `user_config.se_notifications_on_error_drop_exceeds_threshold_breach` parameter set to `True` enables notification when error threshold reaches above the configured value
16. The `user_config.se_notifications_on_error_drop_threshold` parameter captures error drop threshold value
17. The `user_config.se_enable_error_table` parameter, which controls whether error data to load into error table, is set to true by default
18. When `user_config.enable_query_dq_detailed_result` parameter set to `True`, enables the option to capture the query_dq detailed stats to detailed_stats table. By default set to `False`
19. When `user_config.enable_agg_dq_detailed_result` parameter set to `True`, enables the option to capture the agg_dq detailed stats to detailed_stats table. By default set to `False`
20. The `user_config.querydq_output_custom_table_name` parameter is used to specify the name of the custom query_dq output table which captures the output of the alias queries passed in the query dq expectation. Default is <stats_table>_custom_output
21. The `user_config.se_dq_rules_params` parameter, which are required to dynamically update dq rules


### Spark Expectations Initialization
Expand All @@ -62,7 +64,8 @@ For all the below examples the below import and SparkExpectations class instanti

When store for sensitive details is Databricks secret scope,construct config dictionary for authentication of Kafka and
avoid duplicate construction every time your project is initialized, you can create a dictionary with the following keys and their appropriate values.
This dictionary can be placed in the __init__.py file of your project or declared as a global variable.
This dictionary can be placed in the __init__.py file of your project or declared as a global variable. In case you need authentication for the SMTP server,
you can store the password in a Databricks secret scope as well, or choose Cerberus for this secret storage.
```python
from typing import Dict, Union
from spark_expectations.config.user_config import Constants as user_config
Expand All @@ -77,6 +80,7 @@ stats_streaming_config_dict: Dict[str, Union[bool, str]] = {
user_config.dbx_secret_app_name: "se_streaming_auth_secret_appid_key", # (7)!
user_config.dbx_secret_token: "se_streaming_auth_secret_token_key", # (8)!
user_config.dbx_topic_name: "se_streaming_topic_name", # (9)!
user_config.dbx_smtp_password: "smtp_password_secret_key", # (10)!
}
```

Expand All @@ -89,6 +93,7 @@ stats_streaming_config_dict: Dict[str, Union[bool, str]] = {
7. The `user_config.dbx_secret_app_name` captures secret key for the Kafka authentication app name
8. The `user_config.dbx_secret_token` captures secret key for the Kafka authentication app secret token
9. The `user_config.dbx_topic_name` captures secret key for the Kafka topic name
10. The `user_config.dbx_smtp_password` captures secret key for the SMTP password

Similarly when sensitive store is Cerberus:

Expand All @@ -106,6 +111,7 @@ stats_streaming_config_dict: Dict[str, Union[bool, str]] = {
user_config.cbs_secret_app_name: "se_streaming_auth_secret_appid_sdb_path", # (7)!
user_config.cbs_secret_token: "se_streaming_auth_secret_token_sdb_path", # (8)!
user_config.cbs_topic_name: "se_streaming_topic_name_sdb_path", # (9)!
user_config.cbs_smtp_password: "smtp_password_secret_key", # (10)!
}
```

Expand All @@ -118,8 +124,12 @@ stats_streaming_config_dict: Dict[str, Union[bool, str]] = {
7. The `user_config.cbs_secret_app_name` captures path where Kafka authentication app name stored in the Cerberus sdb
8. The `user_config.cbs_secret_token` captures path where Kafka authentication app name secret token stored in the Cerberus sdb
9. The `user_config.cbs_topic_name` captures path where Kafka topic name stored in the Cerberus sdb
10. The `user_config.cbs_smtp_password` captures key for the SMTP password

You can disable the streaming functionality by setting the `user_config.se_enable_streaming` parameter to `False`.
You can still pass the secret keys for the SMTP password, even if streaming is disabled.

```python
from typing import Dict, Union
Expand Down
16 changes: 15 additions & 1 deletion docs/iceberg.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,33 @@ from spark_expectations.config.user_config import Constants as user_config

writer = WrappedDataFrameWriter().mode("append").format("iceberg")

# If the SMTP server needs to be authenticated, the password should be set in a secure way, e.g. via Cerberus or a Databricks secret
stats_streaming_config_dict = {
user_config.se_enable_streaming: False,
user_config.secret_type: "cerberus",
user_config.cbs_url: "https://cerberus.example.com",
user_config.cbs_sdb_path: "",
user_config.cbs_smtp_password: "",
# user_config.secret_type: "databricks",
# user_config.dbx_workspace_url: "https://workspace.cloud.databricks.com",
# user_config.dbx_secret_scope: "your_secret_scope",
# user_config.dbx_smtp_password: "your_password",
}

se: SparkExpectations = SparkExpectations(
product_id="your_product",
rules_df=spark.sql("select * from dq_spark_local.dq_rules"),
stats_table="dq_spark_local.dq_stats",
stats_table_writer=writer,
target_and_error_table_writer=writer,
debugger=False,
stats_streaming_options={user_config.se_enable_streaming: False},
stats_streaming_options=stats_streaming_config_dict,
)

# Commented fields are optional or required when notifications are enabled
user_conf = {
user_config.se_notifications_enable_email: False,
# user_config.se_notifications_enable_smtp_server_auth: False,
# user_config.se_notifications_enable_custom_email_body: True,
# user_config.se_notifications_email_smtp_host: "mailhost.com",
# user_config.se_notifications_email_smtp_port: 25,
Expand Down
5 changes: 5 additions & 0 deletions spark_expectations/config/user_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
class Constants:
# declare const user config variables for email notification
se_notifications_enable_email = "spark.expectations.notifications.email.enabled"
se_notifications_enable_smtp_server_auth = (
"spark.expectations.notifications.email.smtp_server_auth"
)
se_notifications_enable_custom_email_body = (
"spark.expectations.notifications.enable.custom.email.body"
)
Expand Down Expand Up @@ -65,6 +68,7 @@ class Constants:
cbs_secret_app_name = "se.streaming.cbs.secret.app.name"
cbs_secret_token = "se.streaming.cerberus.secret.token"
cbs_topic_name = "se.streaming.cerberus.token.name"
cbs_smtp_password = "se.streaming.cerberus.smtp.password"

dbx_workspace_url = "se.streaming.dbx.workspace.url"
dbx_secret_scope = "se.streaming.dbx.secret.scope"
Expand All @@ -73,6 +77,7 @@ class Constants:
dbx_secret_app_name = "se.streaming.dbx.secret.app.name"
dbx_secret_token = "se.streaming.dbx.secret.token"
dbx_topic_name = "se.streaming.dbx.topic.name"
dbx_smtp_password = "se.streaming.dbx.smtp.password"

# declare const user config variables for agg query dq detailed stats
se_enable_agg_dq_detailed_result = "spark.expectations.agg.dq.detailed.stats"
Expand Down
Loading

0 comments on commit 52cc1d3

Please sign in to comment.