[MC-1458] Add newtab_merino_priors DAG (#6303)
* [MC-1458] Add newtab_merino_priors DAG

* Extract shared JSON export function

---------

Co-authored-by: Chelsey Beck <64881557+chelseybeck@users.noreply.github.com>
mmiermans and chelseybeck authored Oct 14, 2024
1 parent fe03467 commit 5e08071
Showing 10 changed files with 418 additions and 126 deletions.
131 changes: 131 additions & 0 deletions bigquery_etl/newtab_merino/__init__.py
@@ -0,0 +1,131 @@
"""Extract query results and write the combined JSON to a single file."""

import json
import logging
from datetime import datetime, timedelta, timezone

import rich_click as click
from google.cloud import storage # type: ignore
from google.cloud import bigquery

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


@click.command()
@click.option(
"--source-project",
required=True,
help="Google Cloud Project where the source table is located.",
)
@click.option(
"--source-dataset",
required=True,
help="Dataset in BigQuery where the source table is located.",
)
@click.option(
"--source-table", required=True, help="Name of the source table in BigQuery."
)
@click.option(
"--destination-bucket",
required=True,
help="Destination Google Cloud Storage Bucket.",
)
@click.option(
"--destination-prefix", required=True, help="Prefix of the bucket path in GCS."
)
@click.option(
"--destination-prefix", required=True, help="Prefix of the bucket path in GCS."
)
@click.option(
"--deletion-days-old",
required=True,
type=int,
help="Number of days after which files in GCS should be deleted.",
)
def export_newtab_merino_table_to_gcs(
source_project: str,
source_dataset: str,
source_table: str,
destination_bucket: str,
destination_prefix: str,
deletion_days_old: int,
):
"""Use bigquery client to export data from BigQuery to GCS."""
client = bigquery.Client(source_project)
error_counter = 0
threshold = 1

try:
# Generate the current timestamp
timestamp = datetime.utcnow().strftime("%Y%m%d%H%M")

# BigQuery does not export the proper JSON format, so we use a temp file and reformat
temp_file = "temp.ndjson"

job_config = bigquery.job.ExtractJobConfig(
destination_format=bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON
)

destination_uri = f"gs://{destination_bucket}/{destination_prefix}/{temp_file}"

extract_job = client.extract_table(
source=f"{source_project}.{source_dataset}.{source_table}",
destination_uris=[destination_uri],
job_config=job_config,
)

extract_job.result() # Waits for the job to complete.

# Verify that job was successful
if extract_job.state != "DONE":
log.error(f"Export failed with errors: {extract_job.errors}")
error_counter += 1

# Initialize the storage client
storage_client = storage.Client()
bucket = storage_client.bucket(destination_bucket)
blob = bucket.blob(f"{destination_prefix}/{temp_file}")

# Read the temporary JSON file from GCS
temp_file_content = blob.download_as_text()

# Convert the content to a JSON array
json_array = [json.loads(line) for line in temp_file_content.splitlines()]
json_data = json.dumps(json_array, indent=1)

# Write the JSON array to the final destination files in GCS:
# 1. latest.json is a single file, that's easy to reference from Merino.
# 2. {timestamp}.json keeps a historical record for debugging purposes.
for suffix in ["latest", timestamp]:
final_destination_uri = f"{destination_prefix}/{suffix}.json"
final_blob = bucket.blob(final_destination_uri)
final_blob.upload_from_string(json_data, content_type="application/json")

# Delete the temporary file from GCS
blob.delete()

# Delete files older than `deletion_days_old` days
delete_old_files(bucket, destination_prefix, deletion_days_old)

log.info("Export successful and temporary file deleted")

except Exception as err:
error_counter += 1
log.error(f"An error occurred: {err}")

if error_counter > threshold:
raise Exception(
f"More than the accepted threshold of {threshold} operations failed."
)


def delete_old_files(bucket, prefix, days_old):
"""Delete files older than `days_old` days from the bucket with the given prefix."""
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
blobs = bucket.list_blobs(prefix=prefix)

for blob in blobs:
if blob.updated < cutoff_date:
blob.delete()
log.info(f"Deleted {blob.name}")
21 changes: 21 additions & 0 deletions dags.yaml
@@ -1790,6 +1790,27 @@ bqetl_merino_newtab_extract_to_gcs:
- repo/bigquery-etl
- impact/tier_1

bqetl_merino_newtab_priors_to_gcs:
default_args:
depends_on_past: false
email:
- cbeck@mozilla.com
- gkatre@mozilla.com
email_on_failure: true
email_on_retry: false
end_date: null
owner: cbeck@mozilla.com
retries: 2
retry_delay: 5m
start_date: '2024-10-08'
description: |
Aggregates Newtab stats that land in a GCS bucket for Merino to derive Thompson sampling priors.
repo: bigquery-etl
schedule_interval: "0 2 * * *"
tags:
- repo/bigquery-etl
- impact/tier_1

bqetl_dynamic_dau:
default_args:
depends_on_past: false
sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/metadata.yaml
@@ -15,7 +15,7 @@ scheduling:
- --source-dataset=telemetry_derived
- --source-table=newtab_merino_extract_v1
- --destination-bucket=merino-airflow-data-prodpy
- --destination-prefix=newtab-merino-exports
- --destination-prefix=newtab-merino-exports/engagement
- --deletion-days-old=3
bigquery: null
references: {}
sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/query.py
@@ -1,128 +1,6 @@
import json
import logging
from datetime import datetime, timedelta, timezone

import rich_click as click
from google.cloud import storage # type: ignore
from google.cloud import bigquery


@click.command()
@click.option(
"--source-project",
required=True,
help="Google Cloud Project where the source table is located.",
)
@click.option(
"--source-dataset",
required=True,
help="Dataset in BigQuery where the source table is located.",
)
@click.option(
"--source-table", required=True, help="Name of the source table in BigQuery."
)
@click.option(
"--destination-bucket",
required=True,
help="Destination Google Cloud Storage Bucket.",
)
@click.option(
"--destination-prefix", required=True, help="Prefix of the bucket path in GCS."
)

@click.option(
"--deletion-days-old",
required=True,
type=int,
help="Number of days after which files in GCS should be deleted.",
)

def export_newtab_merino_extract_to_gcs(
source_project: str,
source_dataset: str,
source_table: str,
destination_bucket: str,
destination_prefix: str,
deletion_days_old: int,
):
"""Use bigquery client to export data from BigQuery to GCS."""
client = bigquery.Client(source_project)
error_counter = 0
threshold = 1

try:
# Generate the current timestamp
timestamp = datetime.utcnow().strftime("%Y%m%d%H%M")

# BigQuery does not export the proper JSON format, so we use a temp file and reformat
temp_file = "temp.ndjson"

job_config = bigquery.job.ExtractJobConfig(
destination_format=bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON
)

destination_uri = f"gs://{destination_bucket}/{destination_prefix}/{temp_file}"

extract_job = client.extract_table(
source=f"{source_project}.{source_dataset}.{source_table}",
destination_uris=[destination_uri],
job_config=job_config,
)

extract_job.result() # Waits for the job to complete.

# Verify that job was successful
if extract_job.state != "DONE":
logging.error("Export failed with errors:", extract_job.errors)
error_counter += 1

# Initialize the storage client
storage_client = storage.Client()
bucket = storage_client.bucket(destination_bucket)
blob = bucket.blob(f"{destination_prefix}/{temp_file}")

# Read the temporary JSON file from GCS
temp_file_content = blob.download_as_text()

# Convert the content to a JSON array
json_array = [json.loads(line) for line in temp_file_content.splitlines()]

# Write the JSON array to the final destination file in GCS
final_destination_uri = f"{destination_prefix}/engagement_{timestamp}.json"
final_blob = bucket.blob(final_destination_uri)
final_blob.upload_from_string(
json.dumps(json_array, indent=1), content_type="application/json"
)

# Delete the temporary file from GCS
blob.delete()

# Delete files older than 3 days
delete_old_files(bucket, destination_prefix, deletion_days_old)

logging.info("Export successful and temporary file deleted")

except Exception as err:
error_counter += 1
logging.error(f"An error occurred: {err}")

if error_counter > threshold:
raise Exception(
f"More than the accepted threshold of {threshold} operations failed."
)


def delete_old_files(bucket, prefix, days_old):
"""Delete files older than `days_old` days from the bucket with the given prefix."""
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
blobs = bucket.list_blobs(prefix=prefix)

for blob in blobs:
if blob.time_created < cutoff_date:
blob.delete()
logging.info(f"Deleted {blob.name}")
"""Extract New Tab engagement query results and write the combined JSON to a single file."""

from bigquery_etl.newtab_merino import export_newtab_merino_table_to_gcs

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
export_newtab_merino_extract_to_gcs()
export_newtab_merino_table_to_gcs()
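
With the export logic extracted into bigquery_etl.newtab_merino, both query.py entry points are thin wrappers around the same click command and differ only in the CLI arguments Airflow passes them. A hedged local sketch using click's test runner (the argument values mirror the priors metadata.yaml in this commit; actually running the export requires Google Cloud credentials with access to that project and bucket):

from click.testing import CliRunner

from bigquery_etl.newtab_merino import export_newtab_merino_table_to_gcs

# Argument set copied from newtab_merino_priors_to_gcs_v1/metadata.yaml;
# the engagement export differs only in --source-table and --destination-prefix.
priors_args = [
    "--source-project=moz-fx-data-shared-prod",
    "--source-dataset=telemetry_derived",
    "--source-table=newtab_merino_priors_v1",
    "--destination-bucket=merino-airflow-data-prodpy",
    "--destination-prefix=newtab-merino-exports/priors",
    "--deletion-days-old=3",
]

# Invoke the shared click command in-process. Without valid GCP credentials
# the export fails, and the failure is reported on `result` instead of raising.
result = CliRunner().invoke(export_newtab_merino_table_to_gcs, priors_args)
print(result.exit_code, result.output)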
sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/metadata.yaml
@@ -0,0 +1,22 @@
friendly_name: Newtab Merino BigQuery Prior stats to Google Cloud Storage (GCS)
description: |-
Newtab stats that inform the Thompson sampling priors are exported to a GCS
bucket for Merino to consume. The table rebuilds daily and aggregates 7 days
of data.
owners:
- cbeck@mozilla.com
- gkatre@mozilla.com
labels:
incremental: false
owner1: cbeck
scheduling:
dag_name: bqetl_merino_newtab_priors_to_gcs
arguments:
- --source-project=moz-fx-data-shared-prod
- --source-dataset=telemetry_derived
- --source-table=newtab_merino_priors_v1
- --destination-bucket=merino-airflow-data-prodpy
- --destination-prefix=newtab-merino-exports/priors
- --deletion-days-old=3
bigquery: null
references: {}
sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/query.py
@@ -0,0 +1,6 @@
"""Extract Thompson sampling prior query results and write the combined JSON to a single file."""

from bigquery_etl.newtab_merino import export_newtab_merino_table_to_gcs

if __name__ == "__main__":
export_newtab_merino_table_to_gcs()
sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/checks.sql
@@ -0,0 +1,10 @@
-- macro checks

#fail
{{ not_null(["average_ctr_top2_items"]) }}

#fail
{{ not_null(["impressions_per_item"]) }}

#fail
{{ min_row_count(1) }}
sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/metadata.yaml
@@ -0,0 +1,15 @@
friendly_name: Newtab Merino Priors
description: |-
Queries New Tab stats used by Merino to calculate Thompson sampling priors.
These determine how new items (without engagement data) are ranked on New Tab.
owners:
- cbeck@mozilla.com
- gkatre@mozilla.com
labels:
incremental: false
owner: cbeck
bigquery:
time_partitioning: null
scheduling:
dag_name: bqetl_merino_newtab_priors_to_gcs
date_partition_parameter: null
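
How Merino turns the two exported columns into Thompson sampling priors is outside this repo, so the following is only one plausible reading, assuming a Beta-Bernoulli model: treat average_ctr_top2_items as the prior mean and impressions_per_item as the prior's pseudo-observation count. The alpha/beta formulas and the numbers below are illustrative assumptions, not Merino's actual code:

import random

def beta_prior_from_stats(average_ctr_top2_items: float, impressions_per_item: float):
    """Assumed mapping from the exported stats to a Beta(alpha, beta) prior."""
    alpha = average_ctr_top2_items * impressions_per_item  # pseudo-clicks
    beta = impressions_per_item - alpha  # pseudo-impressions without a click
    return alpha, beta

# A brand-new item has no engagement of its own, so it is scored by sampling
# from the prior; higher samples rank higher on New Tab.
alpha, beta = beta_prior_from_stats(0.04, 1000.0)  # illustrative values
sampled_score = random.betavariate(alpha, beta)
print(alpha, beta, sampled_score)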

1 comment on commit 5e08071

@dataops-ci-bot

Integration report for "[MC-1458] Add newtab_merino_priors DAG (#6303)"

sql.diff

Only in /tmp/workspace/generated-sql/dags/: bqetl_merino_newtab_priors_to_gcs.py
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_merino_newtab_extract_to_gcs.py /tmp/workspace/generated-sql/dags/bqetl_merino_newtab_extract_to_gcs.py
--- /tmp/workspace/main-generated-sql/dags/bqetl_merino_newtab_extract_to_gcs.py	2024-10-14 17:12:32.000000000 +0000
+++ /tmp/workspace/generated-sql/dags/bqetl_merino_newtab_extract_to_gcs.py	2024-10-14 17:14:22.000000000 +0000
@@ -88,7 +88,7 @@
             "--source-dataset=telemetry_derived",
             "--source-table=newtab_merino_extract_v1",
             "--destination-bucket=merino-airflow-data-prodpy",
-            "--destination-prefix=newtab-merino-exports",
+            "--destination-prefix=newtab-merino-exports/engagement",
             "--deletion-days-old=3",
         ],
         image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_merino_newtab_priors_to_gcs.py /tmp/workspace/generated-sql/dags/bqetl_merino_newtab_priors_to_gcs.py
--- /tmp/workspace/main-generated-sql/dags/bqetl_merino_newtab_priors_to_gcs.py	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/dags/bqetl_merino_newtab_priors_to_gcs.py	2024-10-14 17:14:22.000000000 +0000
@@ -0,0 +1,117 @@
+# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
+
+from airflow import DAG
+from airflow.sensors.external_task import ExternalTaskMarker
+from airflow.sensors.external_task import ExternalTaskSensor
+from airflow.utils.task_group import TaskGroup
+import datetime
+from operators.gcp_container_operator import GKEPodOperator
+from utils.constants import ALLOWED_STATES, FAILED_STATES
+from utils.gcp import bigquery_etl_query, bigquery_dq_check
+from bigeye_airflow.operators.run_metrics_operator import RunMetricsOperator
+
+docs = """
+### bqetl_merino_newtab_priors_to_gcs
+
+Built from bigquery-etl repo, [`dags/bqetl_merino_newtab_priors_to_gcs.py`](https://github.com/mozilla/bigquery-etl/blob/generated-sql/dags/bqetl_merino_newtab_priors_to_gcs.py)
+
+#### Description
+
+Aggregates Newtab stats that land in a GCS bucket for Merino to derive Thompson sampling priors.
+
+#### Owner
+
+cbeck@mozilla.com
+
+#### Tags
+
+* impact/tier_1
+* repo/bigquery-etl
+"""
+
+
+default_args = {
+    "owner": "cbeck@mozilla.com",
+    "start_date": datetime.datetime(2024, 10, 8, 0, 0),
+    "end_date": None,
+    "email": ["cbeck@mozilla.com", "gkatre@mozilla.com"],
+    "depends_on_past": False,
+    "retry_delay": datetime.timedelta(seconds=300),
+    "email_on_failure": True,
+    "email_on_retry": False,
+    "retries": 2,
+}
+
+tags = ["impact/tier_1", "repo/bigquery-etl"]
+
+with DAG(
+    "bqetl_merino_newtab_priors_to_gcs",
+    default_args=default_args,
+    schedule_interval="0 2 * * *",
+    doc_md=docs,
+    tags=tags,
+) as dag:
+
+    wait_for_copy_deduplicate_all = ExternalTaskSensor(
+        task_id="wait_for_copy_deduplicate_all",
+        external_dag_id="copy_deduplicate",
+        external_task_id="copy_deduplicate_all",
+        execution_delta=datetime.timedelta(seconds=3600),
+        check_existence=True,
+        mode="reschedule",
+        allowed_states=ALLOWED_STATES,
+        failed_states=FAILED_STATES,
+        pool="DATA_ENG_EXTERNALTASKSENSOR",
+    )
+
+    checks__fail_telemetry_derived__newtab_merino_priors__v1 = bigquery_dq_check(
+        task_id="checks__fail_telemetry_derived__newtab_merino_priors__v1",
+        source_table="newtab_merino_priors_v1",
+        dataset_id="telemetry_derived",
+        project_id="moz-fx-data-shared-prod",
+        is_dq_check_fail=True,
+        owner="cbeck@mozilla.com",
+        email=["cbeck@mozilla.com", "gkatre@mozilla.com"],
+        depends_on_past=False,
+        task_concurrency=1,
+        retries=0,
+    )
+
+    telemetry_derived__newtab_merino_priors__v1 = bigquery_etl_query(
+        task_id="telemetry_derived__newtab_merino_priors__v1",
+        destination_table="newtab_merino_priors_v1",
+        dataset_id="telemetry_derived",
+        project_id="moz-fx-data-shared-prod",
+        owner="cbeck@mozilla.com",
+        email=["cbeck@mozilla.com", "gkatre@mozilla.com"],
+        date_partition_parameter=None,
+        depends_on_past=False,
+        task_concurrency=1,
+    )
+
+    telemetry_derived__newtab_merino_priors_to_gcs__v1 = GKEPodOperator(
+        task_id="telemetry_derived__newtab_merino_priors_to_gcs__v1",
+        arguments=[
+            "python",
+            "sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/query.py",
+        ]
+        + [
+            "--source-project=moz-fx-data-shared-prod",
+            "--source-dataset=telemetry_derived",
+            "--source-table=newtab_merino_priors_v1",
+            "--destination-bucket=merino-airflow-data-prodpy",
+            "--destination-prefix=newtab-merino-exports/priors",
+            "--deletion-days-old=3",
+        ],
+        image="gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest",
+        owner="cbeck@mozilla.com",
+        email=["cbeck@mozilla.com", "gkatre@mozilla.com"],
+    )
+
+    checks__fail_telemetry_derived__newtab_merino_priors__v1.set_upstream(
+        telemetry_derived__newtab_merino_priors__v1
+    )
+
+    telemetry_derived__newtab_merino_priors__v1.set_upstream(
+        wait_for_copy_deduplicate_all
+    )
Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived: newtab_merino_priors_to_gcs_v1
Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived: newtab_merino_priors_v1
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/metadata.yaml
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/metadata.yaml	2024-10-14 17:09:21.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/metadata.yaml	2024-10-14 17:09:38.000000000 +0000
@@ -16,7 +16,7 @@
   - --source-dataset=telemetry_derived
   - --source-table=newtab_merino_extract_v1
   - --destination-bucket=merino-airflow-data-prodpy
-  - --destination-prefix=newtab-merino-exports
+  - --destination-prefix=newtab-merino-exports/engagement
   - --deletion-days-old=3
 bigquery: null
 workgroup_access:
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/query.py /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/query.py
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/query.py	2024-10-14 17:07:25.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_extract_to_gcs_v1/query.py	2024-10-14 17:07:26.000000000 +0000
@@ -1,128 +1,6 @@
-import json
-import logging
-from datetime import datetime, timedelta, timezone
-
-import rich_click as click
-from google.cloud import storage  # type: ignore
-from google.cloud import bigquery
-
-
-@click.command()
-@click.option(
-    "--source-project",
-    required=True,
-    help="Google Cloud Project where the source table is located.",
-)
-@click.option(
-    "--source-dataset",
-    required=True,
-    help="Dataset in BigQuery where the source table is located.",
-)
-@click.option(
-    "--source-table", required=True, help="Name of the source table in BigQuery."
-)
-@click.option(
-    "--destination-bucket",
-    required=True,
-    help="Destination Google Cloud Storage Bucket.",
-)
-@click.option(
-    "--destination-prefix", required=True, help="Prefix of the bucket path in GCS."
-)
-
-@click.option(
-    "--deletion-days-old",
-    required=True,
-    type=int,
-    help="Number of days after which files in GCS should be deleted.",
-)
-
-def export_newtab_merino_extract_to_gcs(
-    source_project: str,
-    source_dataset: str,
-    source_table: str,
-    destination_bucket: str,
-    destination_prefix: str,
-    deletion_days_old: int,
-):
-    """Use bigquery client to export data from BigQuery to GCS."""
-    client = bigquery.Client(source_project)
-    error_counter = 0
-    threshold = 1
-
-    try:
-        # Generate the current timestamp
-        timestamp = datetime.utcnow().strftime("%Y%m%d%H%M")
-
-        # BigQuery does not export the proper JSON format, so we use a temp file and reformat
-        temp_file = "temp.ndjson"
-
-        job_config = bigquery.job.ExtractJobConfig(
-            destination_format=bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON
-        )
-
-        destination_uri = f"gs://{destination_bucket}/{destination_prefix}/{temp_file}"
-
-        extract_job = client.extract_table(
-            source=f"{source_project}.{source_dataset}.{source_table}",
-            destination_uris=[destination_uri],
-            job_config=job_config,
-        )
-
-        extract_job.result()  # Waits for the job to complete.
-
-        # Verify that job was successful
-        if extract_job.state != "DONE":
-            logging.error("Export failed with errors:", extract_job.errors)
-            error_counter += 1
-
-        # Initialize the storage client
-        storage_client = storage.Client()
-        bucket = storage_client.bucket(destination_bucket)
-        blob = bucket.blob(f"{destination_prefix}/{temp_file}")
-
-        # Read the temporary JSON file from GCS
-        temp_file_content = blob.download_as_text()
-
-        # Convert the content to a JSON array
-        json_array = [json.loads(line) for line in temp_file_content.splitlines()]
-
-        # Write the JSON array to the final destination file in GCS
-        final_destination_uri = f"{destination_prefix}/engagement_{timestamp}.json"
-        final_blob = bucket.blob(final_destination_uri)
-        final_blob.upload_from_string(
-            json.dumps(json_array, indent=1), content_type="application/json"
-        )
-
-        # Delete the temporary file from GCS
-        blob.delete()
-
-        # Delete files older than 3 days
-        delete_old_files(bucket, destination_prefix, deletion_days_old)
-
-        logging.info("Export successful and temporary file deleted")
-
-    except Exception as err:
-        error_counter += 1
-        logging.error(f"An error occurred: {err}")
-
-        if error_counter > threshold:
-            raise Exception(
-                f"More than the accepted threshold of {threshold} operations failed."
-            )
-
-
-def delete_old_files(bucket, prefix, days_old):
-    """Delete files older than `days_old` days from the bucket with the given prefix."""
-    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_old)
-    blobs = bucket.list_blobs(prefix=prefix)
-
-    for blob in blobs:
-        if blob.time_created < cutoff_date:
-            blob.delete()
-            logging.info(f"Deleted {blob.name}")
+"""Extract New Tab engagement query results and write the combined JSON to a single file."""
 
+from bigquery_etl.newtab_merino import export_newtab_merino_table_to_gcs
 
 if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    export_newtab_merino_extract_to_gcs()
+    export_newtab_merino_table_to_gcs()
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/metadata.yaml
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/metadata.yaml	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/metadata.yaml	2024-10-14 17:09:38.000000000 +0000
@@ -0,0 +1,27 @@
+friendly_name: Newtab Merino BigQuery Prior stats to Google Cloud Storage (GCS)
+description: |-
+  Newtab stats that inform the Thompson sampling priors are exported to a GCS
+  bucket for Merino to consume. The table rebuilds daily and aggregates 7 days
+  of data.
+owners:
+- cbeck@mozilla.com
+- gkatre@mozilla.com
+labels:
+  owner1: cbeck
+  dag: bqetl_merino_newtab_priors_to_gcs
+  owner2: gkatre
+scheduling:
+  dag_name: bqetl_merino_newtab_priors_to_gcs
+  arguments:
+  - --source-project=moz-fx-data-shared-prod
+  - --source-dataset=telemetry_derived
+  - --source-table=newtab_merino_priors_v1
+  - --destination-bucket=merino-airflow-data-prodpy
+  - --destination-prefix=newtab-merino-exports/priors
+  - --deletion-days-old=3
+bigquery: null
+workgroup_access:
+- role: roles/bigquery.dataViewer
+  members:
+  - workgroup:mozilla-confidential
+references: {}
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/query.py /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/query.py
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/query.py	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_to_gcs_v1/query.py	2024-10-14 17:07:26.000000000 +0000
@@ -0,0 +1,6 @@
+"""Extract Thompson sampling prior query results and write the combined JSON to a single file."""
+
+from bigquery_etl.newtab_merino import export_newtab_merino_table_to_gcs
+
+if __name__ == "__main__":
+    export_newtab_merino_table_to_gcs()
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/checks.sql
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/checks.sql	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/checks.sql	2024-10-14 17:07:26.000000000 +0000
@@ -0,0 +1,10 @@
+-- macro checks
+
+#fail
+{{ not_null(["average_ctr_top2_items"]) }}
+
+#fail
+{{ not_null(["impressions_per_item"]) }}
+
+#fail
+{{ min_row_count(1) }}
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/metadata.yaml
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/metadata.yaml	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/metadata.yaml	2024-10-14 17:09:38.000000000 +0000
@@ -0,0 +1,28 @@
+friendly_name: Newtab Merino Priors
+description: |-
+  Queries New Tab stats used by Merino to calculate Thompson sampling priors.
+  These determine how new items (without engagement data) are ranked on New Tab.
+owners:
+- cbeck@mozilla.com
+- gkatre@mozilla.com
+labels:
+  owner: cbeck
+  dag: bqetl_merino_newtab_priors_to_gcs
+  owner1: cbeck
+  owner2: gkatre
+scheduling:
+  dag_name: bqetl_merino_newtab_priors_to_gcs
+  date_partition_parameter: null
+bigquery:
+  time_partitioning: null
+  range_partitioning: null
+  clustering: null
+workgroup_access:
+- role: roles/bigquery.dataViewer
+  members:
+  - workgroup:mozilla-confidential
+references:
+  checks.sql:
+  - ..
+  query.sql:
+  - moz-fx-data-shared-prod.firefox_desktop.newtab
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/query.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/query.sql
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/query.sql	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/query.sql	2024-10-14 17:07:26.000000000 +0000
@@ -0,0 +1,199 @@
+WITH
+-- Define common parameters
+params AS (
+  SELECT
+    TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), DAY) AS end_timestamp,
+    TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), DAY) - INTERVAL 7 DAY AS start_timestamp
+),
+-- Flatten events and filter relevant data
+flattened_newtab_events AS (
+  SELECT
+    sub.*
+  FROM
+    (
+      SELECT
+        submission_timestamp,
+        normalized_country_code AS region,
+        event.name AS event_name,
+        SAFE_CAST(
+          mozfun.map.get_key(event.extra, 'scheduled_corpus_item_id') AS STRING
+        ) AS scheduled_corpus_item_id,
+        SAFE_CAST(mozfun.map.get_key(event.extra, 'recommended_at') AS INT64) AS recommended_at
+      FROM
+        `moz-fx-data-shared-prod.firefox_desktop.newtab`,
+        UNNEST(events) AS event,
+        params
+      WHERE
+        submission_timestamp >= params.start_timestamp
+        AND submission_timestamp < params.end_timestamp
+        AND event.category = 'pocket'
+        AND event.name IN ('impression', 'click')
+        AND mozfun.map.get_key(event.extra, 'scheduled_corpus_item_id') IS NOT NULL
+        AND SAFE_CAST(mozfun.map.get_key(event.extra, 'recommended_at') AS INT64) IS NOT NULL
+    ) AS sub,
+    params
+  WHERE
+    TIMESTAMP_MILLIS(recommended_at) >= params.start_timestamp
+    AND TIMESTAMP_MILLIS(recommended_at) < params.end_timestamp
+),
+-- Aggregate events by scheduled_corpus_item_id and region
+aggregated_events AS (
+  SELECT
+    scheduled_corpus_item_id,
+    region,
+    SUM(IF(event_name = 'impression', 1, 0)) AS impression_count,
+    SUM(IF(event_name = 'click', 1, 0)) AS click_count
+  FROM
+    flattened_newtab_events
+  GROUP BY
+    scheduled_corpus_item_id,
+    region
+),
+-- Calculate CTR per scheduled_corpus_item_id and region
+per_region_ctr AS (
+  SELECT
+    scheduled_corpus_item_id,
+    region,
+    SAFE_DIVIDE(click_count, impression_count) AS ctr,
+    impression_count,
+    click_count
+  FROM
+    aggregated_events
+  WHERE
+    impression_count > 0
+),
+-- Calculate average impressions per item per region and round to whole number
+per_region_impressions_per_item AS (
+  SELECT
+    region,
+    ROUND(AVG(impression_count)) AS impressions_per_item
+  FROM
+    aggregated_events
+  GROUP BY
+    region
+),
+-- Rank items by click_count per region
+ranked_per_region AS (
+  SELECT
+    *,
+    ROW_NUMBER() OVER (PARTITION BY region ORDER BY click_count DESC) AS rank
+  FROM
+    per_region_ctr
+),
+-- Select top 2 items per region
+top2_per_region AS (
+  SELECT
+    scheduled_corpus_item_id,
+    region,
+    ctr
+  FROM
+    ranked_per_region
+  WHERE
+    rank <= 2
+),
+-- Calculate average CTR of top 2 items per region
+per_region_stats AS (
+  SELECT
+    region,
+    AVG(ctr) AS average_ctr_top2_items
+  FROM
+    top2_per_region
+  GROUP BY
+    region
+),
+-- Combine per-region stats with impressions_per_item
+per_region_stats_with_impressions AS (
+  SELECT
+    s.region,
+    s.average_ctr_top2_items,
+    i.impressions_per_item
+  FROM
+    per_region_stats s
+  JOIN
+    per_region_impressions_per_item i
+    USING (region)
+),
+-- Aggregate events globally
+aggregated_events_global AS (
+  SELECT
+    scheduled_corpus_item_id,
+    SUM(impression_count) AS impression_count,
+    SUM(click_count) AS click_count
+  FROM
+    aggregated_events
+  GROUP BY
+    scheduled_corpus_item_id
+),
+-- Calculate CTR per scheduled_corpus_item_id globally
+per_global_ctr AS (
+  SELECT
+    scheduled_corpus_item_id,
+    SAFE_DIVIDE(click_count, impression_count) AS ctr,
+    impression_count,
+    click_count
+  FROM
+    aggregated_events_global
+  WHERE
+    impression_count > 0
+),
+-- Calculate average impressions per item globally and round to whole number
+global_impressions_per_item AS (
+  SELECT
+    CAST(NULL AS STRING) AS region,
+    ROUND(AVG(impression_count)) AS impressions_per_item
+  FROM
+    aggregated_events_global
+),
+-- Rank items by click_count globally
+ranked_global AS (
+  SELECT
+    *,
+    ROW_NUMBER() OVER (ORDER BY click_count DESC) AS rank
+  FROM
+    per_global_ctr
+),
+-- Select top 2 items globally
+top2_global AS (
+  SELECT
+    scheduled_corpus_item_id,
+    ctr
+  FROM
+    ranked_global
+  WHERE
+    rank <= 2
+),
+-- Calculate average CTR of top 2 items globally
+global_stats AS (
+  SELECT
+    CAST(NULL AS STRING) AS region,
+    AVG(ctr) AS average_ctr_top2_items
+  FROM
+    top2_global
+),
+-- Combine global stats with impressions_per_item
+global_stats_with_impressions AS (
+  SELECT
+    s.region,
+    s.average_ctr_top2_items,
+    i.impressions_per_item
+  FROM
+    global_stats s
+  CROSS JOIN
+    global_impressions_per_item i
+)
+-- Final output combining per-region and global statistics
+SELECT
+  region,
+  average_ctr_top2_items,
+  impressions_per_item
+FROM
+  per_region_stats_with_impressions
+UNION ALL
+SELECT
+  region,
+  average_ctr_top2_items,
+  impressions_per_item
+FROM
+  global_stats_with_impressions
+ORDER BY
+  impressions_per_item DESC;
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/schema.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/schema.yaml
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/schema.yaml	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/newtab_merino_priors_v1/schema.yaml	2024-10-14 17:07:26.000000000 +0000
@@ -0,0 +1,10 @@
+fields:
+- mode: NULLABLE
+  name: region
+  type: STRING
+- mode: NULLABLE
+  name: average_ctr_top2_items
+  type: FLOAT
+- mode: NULLABLE
+  name: impressions_per_item
+  type: FLOAT

Link to full diff
