From b45448bb2c3d15cb5b34fb40d51a1d879479b4fe Mon Sep 17 00:00:00 2001 From: bamader <49412165+bamader@users.noreply.github.com> Date: Tue, 2 May 2023 12:59:40 -0400 Subject: [PATCH 01/12] Add filter to MRN values (#522) * Add filter to MRN values * Fix string comp test --- phdi/linkage/link.py | 2 +- phdi/linkage/postgres.py | 2 +- tests/assets/patient_bundle_to_link_with_mpi.json | 12 ++++++++++++ tests/linkage/test_linkage.py | 2 ++ tests/linkage/test_postgres_mpi_connector.py | 2 +- 5 files changed, 17 insertions(+), 3 deletions(-) diff --git a/phdi/linkage/link.py b/phdi/linkage/link.py index 367b317214..d389af11b5 100644 --- a/phdi/linkage/link.py +++ b/phdi/linkage/link.py @@ -24,7 +24,7 @@ "city": "Patient.address.city", "state": "Patient.address.state", "sex": "Patient.gender", - "mrn": "Patient.identifier.value", + "mrn": "Patient.identifier.where(type.coding.code='MR').value", } diff --git a/phdi/linkage/postgres.py b/phdi/linkage/postgres.py index fb255f2dac..ab4a4c48ec 100644 --- a/phdi/linkage/postgres.py +++ b/phdi/linkage/postgres.py @@ -43,7 +43,7 @@ def __init__( "city": """$.address[*].city""", "first_name": """$.name[*].given""", "last_name": """$.name[*].family""", - "mrn": """$.identifier.value""", + "mrn": """$.identifier ?(@.type.coding[0].code=="MR").value""", "sex": "$.gender", "state": """$.address[*].state""", "zip": """$.address[*].postalCode""", diff --git a/tests/assets/patient_bundle_to_link_with_mpi.json b/tests/assets/patient_bundle_to_link_with_mpi.json index 2d0b47dcc9..9e67afbc48 100644 --- a/tests/assets/patient_bundle_to_link_with_mpi.json +++ b/tests/assets/patient_bundle_to_link_with_mpi.json @@ -113,6 +113,18 @@ "resourceType": "Patient", "id": "2c6d5fd1-4a70-11eb-99fd-ad786a821574", "identifier": [ + { + "value": "649-555-0120", + "type": { + "coding": [ + { + "code": "SSN", + "system": "http://terminology.hl7.org/CodeSystem/v2-0203", + "display": "Social security number" + } + ] + } + }, { "value": "7894561235", "type": { diff --git a/tests/linkage/test_linkage.py b/tests/linkage/test_linkage.py index 586afdca09..a607c0718f 100644 --- a/tests/linkage/test_linkage.py +++ b/tests/linkage/test_linkage.py @@ -1227,6 +1227,8 @@ def test_flatten_patient(): for p in patients if p.get("resource", {}).get("resourceType", "") == "Patient" ] + + # Use patient with multiple identifiers to also test MRN-specific filter assert _flatten_patient_resource(patients[2]) == [ "2c6d5fd1-4a70-11eb-99fd-ad786a821574", None, diff --git a/tests/linkage/test_postgres_mpi_connector.py b/tests/linkage/test_postgres_mpi_connector.py index 9118f8e996..9f73c1bc74 100644 --- a/tests/linkage/test_postgres_mpi_connector.py +++ b/tests/linkage/test_postgres_mpi_connector.py @@ -33,7 +33,7 @@ def test_generate_block_query(): "$.address[*].city", "$.name[*].given", "$.name[*].family", - "$.identifier.value", + '$.identifier ?(@.type.coding[0].code=="MR").value', "$.gender", "$.address[*].state", "$.address[*].postalCode", From 043bb2394ac956a2eb56a58228e5b90c3ec68108 Mon Sep 17 00:00:00 2001 From: gordonfarrell <93161643+gordonfarrell@users.noreply.github.com> Date: Wed, 3 May 2023 13:35:29 -0600 Subject: [PATCH 02/12] Run record linkage migrations on spin up not RL endpoint request (#518) * Run record linkage migrations on spin up not RL endpoint request --- .github/workflows/test.yaml | 10 ++++ containers/record-linkage/app/main.py | 6 ++- .../tests/test_record_linkage.py | 53 +++++-------------- 3 files changed, 26 insertions(+), 43 deletions(-) diff --git 
a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9c047e44c3..ab4dfc2a8b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -144,6 +144,7 @@ jobs: POSTGRES_PASSWORD: pw POSTGRES_DB: testdb POSTGRES_USER: postgres + # Set health checks to wait until postgres has started options: >- --health-cmd pg_isready @@ -171,6 +172,15 @@ jobs: run: | pip install -r requirements.txt - name: Run unit tests for containers + env: + MPI_DBNAME: testdb + MPI_PASSWORD: pw + MPI_DB_TYPE: postgres + MPI_HOST: localhost + MPI_USER: postgres + MPI_PORT: 5432 + MPI_PATIENT_TABLE: patient + MPI_PERSON_TABLE: person working-directory: ./containers/${{matrix.container-to-test}}/tests run: | python -m pytest diff --git a/containers/record-linkage/app/main.py b/containers/record-linkage/app/main.py index e0de731978..1a3dbafa5d 100644 --- a/containers/record-linkage/app/main.py +++ b/containers/record-linkage/app/main.py @@ -62,6 +62,9 @@ def run_migrations(): print_psycopg2_exception(err) +# Run MPI migrations on spin up +run_migrations() + # Instantiate FastAPI and set metadata. description = (Path(__file__).parent.parent / "description.md").read_text( encoding="utf-8" @@ -143,7 +146,7 @@ async def health_check() -> HealthCheckResponse: linkage service is available and running properly. The mpi_connection_status field contains a description of the connection health to the MPI database. """ - run_migrations() + try: connect_to_mpi_with_env_vars() except Exception as err: @@ -166,7 +169,6 @@ async def link_record(input: LinkRecordInput, response: Response) -> LinkRecordR input = dict(input) input_bundle = input.get("bundle", {}) - run_migrations() # Check that DB type is appropriately set up as Postgres so # we can fail fast if it's not diff --git a/containers/record-linkage/tests/test_record_linkage.py b/containers/record-linkage/tests/test_record_linkage.py index dc2eb3959f..ebb69b2b86 100644 --- a/containers/record-linkage/tests/test_record_linkage.py +++ b/containers/record-linkage/tests/test_record_linkage.py @@ -2,24 +2,12 @@ from fastapi.testclient import TestClient from app.config import get_settings from app.main import app -from pydantic import ValidationError import copy import json import os import pathlib import psycopg2 -import pytest - -client = TestClient(app) - -test_bundle = json.load( - open( - pathlib.Path(__file__).parent - / "assets" - / "patient_bundle_to_link_with_mpi.json" - ) -) def set_mpi_env_vars(): @@ -34,6 +22,18 @@ def set_mpi_env_vars(): get_settings.cache_clear() +client = TestClient(app) + + +test_bundle = json.load( + open( + pathlib.Path(__file__).parent + / "assets" + / "patient_bundle_to_link_with_mpi.json" + ) +) + + def pop_mpi_env_vars(): os.environ.pop("mpi_db_type", None) os.environ.pop("mpi_dbname", None) @@ -168,32 +168,3 @@ def test_linkage_success(): dbconn.close() pop_mpi_env_vars() - - -def test_linkage_invalid_postgres_settings(): - set_mpi_env_vars() - for setting in [ - "mpi_dbname", - "mpi_user", - "mpi_password", - "mpi_host", - "mpi_port", - "mpi_patient_table", - "mpi_person_table", - ]: - removed_setting = os.environ[setting] - os.environ.pop(setting, None) - get_settings.cache_clear() - - with pytest.raises(ValidationError) as e: - client.post("/link-record", json={"bundle": test_bundle}) - assert "validation errors for Settings" in str(e.value) - - os.environ[setting] = "invalid_value" - get_settings.cache_clear() - actual_response = client.post("/link-record", json={"bundle": test_bundle}) - assert 
actual_response.status_code == status.HTTP_400_BAD_REQUEST - assert "Could not connect to database" in actual_response.json()["message"] - os.environ[setting] = removed_setting - - pop_mpi_env_vars() From 7e2331edb2f7f129291c73ce316850962a10e7f8 Mon Sep 17 00:00:00 2001 From: Brady Fausett at Skylight Date: Wed, 3 May 2023 15:02:02 -0600 Subject: [PATCH 03/12] New PHDI Release Workflow (#517) * Created new github workflow for New Release - Creates new tag, then new release based upon new tag, builds containers for new tag/release, updates documentation for new tag/release * Created workflow to create new release with tags with updated containers and updated documentation * Update pyproject.toml --------- Co-authored-by: Nick Clyde --- .github/release.yml | 11 +++ ...iners.yaml => buildReleaseContainers.yaml} | 32 +++++-- .github/workflows/createNewRelease.yaml | 85 +++++++++++++++++++ .github/workflows/tagRelease.yaml | 55 ------------ .github/workflows/test.yaml | 5 ++ containers/alerts/app/main.py | 24 ++---- containers/alerts/requirements.txt | 1 + phdi/containers/base_service.py | 3 +- pyproject.toml | 4 +- tests/containers/test_base_service.py | 3 +- tests/not_valid_json_test.json | 1 + tests/test_phdi_building_blocks.py | 8 +- 12 files changed, 145 insertions(+), 87 deletions(-) create mode 100644 .github/release.yml rename .github/workflows/{buildContainers.yaml => buildReleaseContainers.yaml} (68%) create mode 100644 .github/workflows/createNewRelease.yaml delete mode 100644 .github/workflows/tagRelease.yaml create mode 100644 tests/not_valid_json_test.json diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..7aa8ef72bd --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,11 @@ +changelog: + categories: + - title: 🏕 Features + labels: + - '*' + exclude: + labels: + - dependencies + - title: 👒 Dependencies + labels: + - dependencies diff --git a/.github/workflows/buildContainers.yaml b/.github/workflows/buildReleaseContainers.yaml similarity index 68% rename from .github/workflows/buildContainers.yaml rename to .github/workflows/buildReleaseContainers.yaml index 4633156c71..35928d4442 100644 --- a/.github/workflows/buildContainers.yaml +++ b/.github/workflows/buildReleaseContainers.yaml @@ -1,13 +1,16 @@ -name: Build containers +name: Build Release Specific Containers on: + workflow_call: + inputs: + container-tag: + type: string + required: true workflow_dispatch: - push: - branches: - - main - paths: - - "containers/**" - - ".github/workflows/buildContainers.yaml" + inputs: + container-tag: + type: string + required: true jobs: list-containers: @@ -19,6 +22,9 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ inputs.container-tag }} - name: Install dependencies run: | sudo apt-get install jq @@ -43,6 +49,9 @@ jobs: steps: - name: Check Out Changes uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ inputs.container-tag }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 @@ -58,7 +67,16 @@ jobs: id: meta uses: docker/metadata-action@v4 with: + ref: ${{ inputs.container-tag }} images: ghcr.io/${{ github.repository }}/${{matrix.container-to-build}} + # this sets the version for tags and labels for each of the containers to be + # be the same as the version/tag where the code was pulled from + tags: | + type=semver,pattern={{raw}},value=${{ inputs.container-tag }} + type=ref,event=branch + type=ref,event=tag,pattern={{raw}},value=${{ inputs.container-tag }} + labels: | + 
org.opencontainers.image.version=${{ inputs.container-tag }} - name: Build and push uses: docker/build-push-action@v3 diff --git a/.github/workflows/createNewRelease.yaml b/.github/workflows/createNewRelease.yaml new file mode 100644 index 0000000000..dc0910dd9e --- /dev/null +++ b/.github/workflows/createNewRelease.yaml @@ -0,0 +1,85 @@ +name: Create New Release +on: + workflow_dispatch: + inputs: + custom_tag: + name: Set Release Tag Version + description: "Enter version number for release tag below. Don't forget the v! Example: v2.23.9" + type: string + required: true + +# Run all tests before making a release +jobs: + test-for-release: + uses: ./.github/workflows/test.yaml + + + update-version-tag-release: + name: Update phdi init version number + needs: test-for-release + permissions: + contents: write + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + ref: 'main' + - name: Install poetry and dependencies + run: | + pip install poetry + # update the version number in the phdi/__init.py__ file + - name: Update PHDI Version + run: | + poetry version ${{ github.event.inputs.custom_tag }} + # Create new release tag based upon user input + - uses: EndBug/add-and-commit@v9 + with: + message: Create release ${{ github.event.inputs.custom_tag }} + add: pyproject.toml + tag: ${{ github.event.inputs.custom_tag }} + # Create new release based upon the latest created tag + - name: Create Release + id: create_release + uses: actions/create-release@v1.1.4 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.event.inputs.custom_tag }} + release_name: Release ${{ github.event.inputs.custom_tag }} + +# Rebuild all containers for the new release + build-containers-for-release: + needs: update-version-tag-release + uses: ./.github/workflows/buildReleaseContainers.yaml + with: + container-tag: ${{ github.event.inputs.custom_tag }} + +# Create updated docs for the latest release + generate-and-update-docs: + needs: build-containers-for-release + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + ref: ${{ github.event.inputs.custom_tag }} + + - name: Install poetry and dependencies + run: | + pip install poetry + poetry install + + - name: Generate docs and move to docs branch + run: | + poetry run pdoc ./phdi -o ./docs/${{ github.event.inputs.custom_tag }} + git checkout docs -- + rm -rf ./docs/latest + cp -r ./docs/${{ github.event.inputs.custom_tag }} ./docs/latest + + - uses: EndBug/add-and-commit@v9 + with: + message: Automated update of API docs for ${{ github.event.inputs.custom_tag }} release. + add: docs diff --git a/.github/workflows/tagRelease.yaml b/.github/workflows/tagRelease.yaml deleted file mode 100644 index cf6c85b00c..0000000000 --- a/.github/workflows/tagRelease.yaml +++ /dev/null @@ -1,55 +0,0 @@ -name: Test, Tag, Document -on: - workflow_dispatch: - inputs: - custom_tag: - description: "Enter version number for release tag below. Don't forget the v! 
Example: v2.23.9" - type: string - required: true - -jobs: - test: - uses: ./.github/workflows/test.yaml - - create-tagged-release: - needs: test - permissions: - contents: write - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: '0' - - - name: Update version and push tag - uses: anothrNick/github-tag-action@1.55.0 # Don't use @master unless you're happy to test the latest version - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - CUSTOM_TAG: ${{ github.event.inputs.custom_tag }} - - generate-and-update-docs: - needs: create-tagged-release - permissions: - contents: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: '0' - - - name: Install poetry and dependencies - run: | - pip install poetry - poetry install - - - name: Generate docs and move to docs branch - run: | - poetry run pdoc ./phdi -o ./docs/${{ github.event.inputs.custom_tag }} - git checkout docs -- - rm -rf ./docs/latest - cp -r ./docs/${{ github.event.inputs.custom_tag }} ./docs/latest - - - uses: stefanzweifel/git-auto-commit-action@v4 - with: - commit_message: Automated update of API docs for ${{ github.event.inputs.custom_tag }} release. - branch: docs diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ab4dfc2a8b..dfea105a19 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,5 +1,6 @@ name: Test on: + workflow_call: workflow_dispatch: pull_request: branches: @@ -7,6 +8,10 @@ on: push: branches: - main + paths-ignore: + - pyproject.toml + + env: TEST_RUNNER_PYTHON_VERSION: 3.9 diff --git a/containers/alerts/app/main.py b/containers/alerts/app/main.py index 23c380b437..c0b5415fea 100644 --- a/containers/alerts/app/main.py +++ b/containers/alerts/app/main.py @@ -8,7 +8,7 @@ ) from azure.communication.sms import SmsClient from azure.identity import DefaultAzureCredential -from fastapi import FastAPI, Response, status +from fastapi import Response, status from pydantic import BaseModel, BaseSettings, Field import pymsteams from functools import lru_cache @@ -16,25 +16,13 @@ from slack_sdk import WebClient from slack_sdk.errors import SlackApiError from typing import Optional +from phdi.containers.base_service import BaseService + # Instantiate FastAPI and set metadata. 
-description = (Path(__file__).parent.parent / "description.md").read_text( - encoding="utf-8" -) -api = FastAPI( - title="PHDI Alerts Service", - version="0.0.1", - contact={ - "name": "CDC Public Health Data Infrastructure", - "url": "https://cdcgov.github.io/phdi-site/", - "email": "dmibuildingblocks@cdc.gov", - }, - license_info={ - "name": "Creative Commons Zero v1.0 Universal", - "url": "https://creativecommons.org/publicdomain/zero/1.0/", - }, - description=description, -) +api = BaseService( + "PHDI Alerts Service", Path(__file__).parent.parent / "description.md" +).start() class Settings(BaseSettings): diff --git a/containers/alerts/requirements.txt b/containers/alerts/requirements.txt index aa835a636f..3614706d80 100644 --- a/containers/alerts/requirements.txt +++ b/containers/alerts/requirements.txt @@ -2,6 +2,7 @@ azure-communication-identity azure-communication-phonenumbers azure-communication-sms azure-identity +phdi @ git+https://github.com/CDCgov/phdi.git@main fastapi uvicorn httpx diff --git a/phdi/containers/base_service.py b/phdi/containers/base_service.py index d09cd39438..451bf09de7 100644 --- a/phdi/containers/base_service.py +++ b/phdi/containers/base_service.py @@ -1,5 +1,6 @@ from fastapi import FastAPI from pathlib import Path +from importlib import metadata class BaseService: @@ -27,7 +28,7 @@ def __init__( self.include_health_check_endpoint = include_health_check_endpoint self.app = FastAPI( title=service_name, - version="0.0.1", + version=metadata.version("phdi"), contact={ "name": "CDC Public Health Data Infrastructure", "url": "https://cdcgov.github.io/phdi-site/", diff --git a/pyproject.toml b/pyproject.toml index 8bec60c132..988c38ce16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [tool.poetry] name = "phdi" -version = "0.1.0.dev1" +version = "v0.1.0.dev1" description = "Public health data infrastructure Building Blocks is a library to help public health departments work with their data" authors = ["Kenneth Chow ", "Brandon Mader ", "Spencer Kathol "] homepage = "https://github.com/CDCgov/phdi" repository = "https://github.com/CDCgov/phdi" -documentation = "https://cdcgov.github.io/phdi/v0.1.0-dev/" +documentation = "https://cdcgov.github.io/phdi" readme = "README.md" [tool.poetry.dependencies] diff --git a/tests/containers/test_base_service.py b/tests/containers/test_base_service.py index 5783347132..294054d2ad 100644 --- a/tests/containers/test_base_service.py +++ b/tests/containers/test_base_service.py @@ -1,6 +1,7 @@ from phdi.containers.base_service import BaseService from fastapi.testclient import TestClient from pathlib import Path +from importlib import metadata def test_base_service(): @@ -8,7 +9,7 @@ def test_base_service(): "test_service", Path(__file__).parent.parent / "assets" / "test_description.md" ) assert service.app.title == "test_service" - assert service.app.version == "0.0.1" + assert service.app.version == metadata.version("phdi") assert service.app.contact == { "name": "CDC Public Health Data Infrastructure", "url": "https://cdcgov.github.io/phdi-site/", diff --git a/tests/not_valid_json_test.json b/tests/not_valid_json_test.json new file mode 100644 index 0000000000..bb73d201e6 --- /dev/null +++ b/tests/not_valid_json_test.json @@ -0,0 +1 @@ +this is a random string that is not in json format diff --git a/tests/test_phdi_building_blocks.py b/tests/test_phdi_building_blocks.py index e4bb7a24b0..46ec72e42e 100644 --- a/tests/test_phdi_building_blocks.py +++ b/tests/test_phdi_building_blocks.py @@ -1,10 +1,12 @@ 
+from importlib import metadata import toml - -from phdi import __version__ from pathlib import Path def test_version(): with open(Path(__file__).parent.parent / "pyproject.toml") as project_config_file: project_config = toml.load(project_config_file) - assert __version__ == project_config["tool"]["poetry"]["version"] + assert ( + "v" + metadata.version("phdi") + == project_config["tool"]["poetry"]["version"] + ) From f81ecde9af29ca22536c00f2f4d21fbc6d484e31 Mon Sep 17 00:00:00 2001 From: Nick Clyde Date: Wed, 3 May 2023 14:09:58 -0700 Subject: [PATCH 04/12] Generate docs for containers in release workflow (#525) * Created new github workflow for New Release - Creates new tag, then new release based upon new tag, builds containers for new tag/release, updates documentation for new tag/release * Created workflow to create new release with tags with updated containers and updated documentation * Update pyproject.toml * Automated release of container docs * Add permissions for build containers * Add another permission * Use shared workflow for list containers * Target phdi release in requirements.txt in container builds * Target nickclyde repo * Consistently use 'app' instead of 'api' * Use matrix strategy * Try fromJson * Add line to create containers folder * Fix workflow warnings * Add file for git to containers folder * Make containers dir in advance * Move commit to separate job, use artifacts * Switch back to CDCgov --------- Co-authored-by: Brady Fausett at Skylight --- .github/workflows/buildReleaseContainers.yaml | 31 ++----- .github/workflows/createNewRelease.yaml | 90 ++++++++++++++++--- .github/workflows/listContainers.yaml | 26 ++++++ .github/workflows/test.yaml | 21 +---- containers/alerts/Dockerfile | 6 +- containers/alerts/app/main.py | 10 +-- containers/alerts/tests/test_alerts.py | 4 +- containers/fhir-converter/Dockerfile | 2 +- containers/fhir-converter/app/main.py | 6 +- .../tests/test_FHIR-Converter.py | 4 +- utils/make_openapi_json.py | 25 ++++++ 11 files changed, 159 insertions(+), 66 deletions(-) create mode 100644 .github/workflows/listContainers.yaml create mode 100644 utils/make_openapi_json.py diff --git a/.github/workflows/buildReleaseContainers.yaml b/.github/workflows/buildReleaseContainers.yaml index 35928d4442..78f78b42ea 100644 --- a/.github/workflows/buildReleaseContainers.yaml +++ b/.github/workflows/buildReleaseContainers.yaml @@ -14,34 +14,14 @@ on: jobs: list-containers: - # Get a list of all the directories within containers/. 
- runs-on: ubuntu-latest - defaults: - run: - working-directory: ./containers - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ inputs.container-tag }} - - name: Install dependencies - run: | - sudo apt-get install jq - - id: make-list - name: Generate list of directories within containers/ - # use jq to produce json output and filter out the empty item caused by final newline - run: | - echo "::set-output name=containers::$(ls -d * | jq -R -s -c 'split("\n")[:-1]')" - outputs: - container-dirs: ${{steps.make-list.outputs.containers}} + uses: ./.github/workflows/listContainers.yaml build: name: Build containers needs: list-containers runs-on: ubuntu-latest strategy: matrix: - container-to-build: ${{fromJson(needs.list-containers.outputs.container-dirs)}} + container-to-build: ${{fromJson(needs.list-containers.outputs.containers)}} permissions: contents: "read" id-token: "write" @@ -67,7 +47,6 @@ jobs: id: meta uses: docker/metadata-action@v4 with: - ref: ${{ inputs.container-tag }} images: ghcr.io/${{ github.repository }}/${{matrix.container-to-build}} # this sets the version for tags and labels for each of the containers to be # be the same as the version/tag where the code was pulled from @@ -78,6 +57,12 @@ jobs: labels: | org.opencontainers.image.version=${{ inputs.container-tag }} + - name: Target phdi release in requirements.txt + working-directory: ./containers/${{matrix.container-to-build}} + run: | + sed 's/phdi @ git+https:\/\/github.com\/CDCgov\/phdi.git@main/phdi @ git+https:\/\/github.com\/CDCgov\/phdi.git@${{ inputs.container-tag }}/g' requirements.txt > requirements_new.txt && mv requirements_new.txt requirements.txt + + - name: Build and push uses: docker/build-push-action@v3 with: diff --git a/.github/workflows/createNewRelease.yaml b/.github/workflows/createNewRelease.yaml index dc0910dd9e..4a266b4559 100644 --- a/.github/workflows/createNewRelease.yaml +++ b/.github/workflows/createNewRelease.yaml @@ -3,17 +3,16 @@ on: workflow_dispatch: inputs: custom_tag: - name: Set Release Tag Version description: "Enter version number for release tag below. Don't forget the v! 
Example: v2.23.9" type: string required: true # Run all tests before making a release jobs: + list-containers: + uses: ./.github/workflows/listContainers.yaml test-for-release: uses: ./.github/workflows/test.yaml - - update-version-tag-release: name: Update phdi init version number needs: test-for-release @@ -51,12 +50,16 @@ jobs: # Rebuild all containers for the new release build-containers-for-release: needs: update-version-tag-release + permissions: + contents: read + packages: write + id-token: write uses: ./.github/workflows/buildReleaseContainers.yaml with: container-tag: ${{ github.event.inputs.custom_tag }} -# Create updated docs for the latest release - generate-and-update-docs: +# Create updated PHDI docs for the latest release + generate-and-update-phdi-docs: needs: build-containers-for-release permissions: contents: write @@ -74,12 +77,79 @@ jobs: - name: Generate docs and move to docs branch run: | - poetry run pdoc ./phdi -o ./docs/${{ github.event.inputs.custom_tag }} - git checkout docs -- + poetry run pdoc ./phdi -o ./docs/${{ github.event.inputs.custom_tag }}/sdk + + - uses: actions/upload-artifact@v3 + with: + name: phdi-docs + path: ./docs/${{ github.event.inputs.custom_tag }}/sdk + + # Create updated container docs for the latest release + generate-and-update-container-docs: + needs: + - list-containers + - generate-and-update-phdi-docs + permissions: + contents: write + runs-on: ubuntu-latest + strategy: + matrix: + container: ${{fromJson(needs.list-containers.outputs.containers)}} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + ref: ${{ github.event.inputs.custom_tag }} + + - name: Update Container Documenation + run: | + npm i -g redoc-cli + CONTAINER=${{ matrix.container }} + cd $GITHUB_WORKSPACE/containers/$CONTAINER + cp $GITHUB_WORKSPACE/utils/make_openapi_json.py . + pip install -r requirements.txt + python make_openapi_json.py + redoc-cli build -o $GITHUB_WORKSPACE/docs/${{ github.event.inputs.custom_tag }}/containers/$CONTAINER.html openapi.json + + - uses: actions/upload-artifact@v3 + with: + name: container-docs + path: ./docs/${{ github.event.inputs.custom_tag }}/containers + + commit-docs: + needs: + - generate-and-update-phdi-docs + - generate-and-update-container-docs + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + ref: docs + + - name: Download phdi docs from artifacts + uses: actions/download-artifact@v2 + with: + name: phdi-docs + path: ./docs/${{ github.event.inputs.custom_tag }}/sdk + + - name: Download container docs from artifacts + uses: actions/download-artifact@v2 + with: + name: container-docs + path: ./docs/${{ github.event.inputs.custom_tag }}/containers + + - name: Copy to latest folder + run: | rm -rf ./docs/latest - cp -r ./docs/${{ github.event.inputs.custom_tag }} ./docs/latest + mkdir -p ./docs/latest/sdk + mkdir -p ./docs/latest/containers + cp -r ./docs/${{ github.event.inputs.custom_tag }}/sdk/* ./docs/latest/sdk + cp -r ./docs/${{ github.event.inputs.custom_tag }}/containers/* ./docs/latest/containers - - uses: EndBug/add-and-commit@v9 + - name: Commit New Documentation + uses: EndBug/add-and-commit@v9 with: - message: Automated update of API docs for ${{ github.event.inputs.custom_tag }} release. add: docs + message: Automated update of docs for ${{ github.event.inputs.custom_tag }} release. 
diff --git a/.github/workflows/listContainers.yaml b/.github/workflows/listContainers.yaml new file mode 100644 index 0000000000..9767ac356a --- /dev/null +++ b/.github/workflows/listContainers.yaml @@ -0,0 +1,26 @@ +name: "List Containers" + +on: + workflow_call: + outputs: + containers: + value: ${{ jobs.list-containers.outputs.container-dirs }} + +jobs: + list-containers: + runs-on: ubuntu-latest + outputs: + container-dirs: ${{steps.generate-list.outputs.containers}} + steps: + - name: Check Out Changes + uses: actions/checkout@v3 + - name: Install dependencies + run: | + sudo apt-get install jq + - id: generate-list + name: Generate list of directories within containers/ + working-directory: ./containers + # use jq to produce json output and filter out the empty item caused by final newline + run: | + ls -d * | jq -R -s -c 'split("\n")[:-1]' + echo "containers=$(ls -d */ | cut -f1 -d'/' | jq -R -s -c 'split("\n")[:-1]')" >> $GITHUB_OUTPUT \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dfea105a19..6c6b6a292b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -120,24 +120,7 @@ jobs: uses: ResearchSoftwareActions/EnsureCleanNotebooksAction@1.1 list-containers: - # Get a list of all the directories within containers/. - runs-on: ubuntu-latest - defaults: - run: - working-directory: ./containers - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt-get install jq - - id: make-list - name: Generate list of directories within containers/ - # use jq to produce json output and filter out the empty item caused by final newline - run: | - echo "::set-output name=containers::$(ls -d */ | jq -R -s -c 'split("\n")[:-1]')" - outputs: - container-dirs: ${{steps.make-list.outputs.containers}} + uses: ./.github/workflows/listContainers.yaml unit-test-python-containers: needs: list-containers @@ -161,7 +144,7 @@ jobs: - 5432:5432 strategy: matrix: - container-to-test: ${{fromJson(needs.list-containers.outputs.container-dirs)}} + container-to-test: ${{fromJson(needs.list-containers.outputs.containers)}} steps: - name: Checkout uses: actions/checkout@v3 diff --git a/containers/alerts/Dockerfile b/containers/alerts/Dockerfile index bc87342815..ac6c65012e 100644 --- a/containers/alerts/Dockerfile +++ b/containers/alerts/Dockerfile @@ -2,6 +2,10 @@ FROM python:3.10-slim WORKDIR /code +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y git + COPY ./requirements.txt /code/requirements.txt RUN pip install -r requirements.txt @@ -9,4 +13,4 @@ COPY ./app /code/app COPY ./description.md /code/description.md EXPOSE 8080 -CMD uvicorn app.main:api --host 0.0.0.0 --port 8080 \ No newline at end of file +CMD uvicorn app.main:app --host 0.0.0.0 --port 8080 \ No newline at end of file diff --git a/containers/alerts/app/main.py b/containers/alerts/app/main.py index c0b5415fea..dbcbb9bcc1 100644 --- a/containers/alerts/app/main.py +++ b/containers/alerts/app/main.py @@ -20,7 +20,7 @@ # Instantiate FastAPI and set metadata. -api = BaseService( +app = BaseService( "PHDI Alerts Service", Path(__file__).parent.parent / "description.md" ).start() @@ -61,7 +61,7 @@ class TeamsAlertInput(BaseModel): message: str = Field(description="The message to send to the Teams channel.") -@api.get("/") +@app.get("/") async def health_check(): """ Check service status. 
If an HTTP 200 status code is returned along with @@ -70,7 +70,7 @@ async def health_check(): return {"status": "OK"} -@api.post("/sms-alert", status_code=200) +@app.post("/sms-alert", status_code=200) async def sms_alert(input: SmsAlertInput, response: Response): """ Send an SMS alert to a phone number. @@ -97,7 +97,7 @@ async def sms_alert(input: SmsAlertInput, response: Response): ) -@api.post("/slack-alert", status_code=200) +@app.post("/slack-alert", status_code=200) async def slack_alert(input: SlackAlertInput, response: Response): """ Send a Slack alert to a channel. @@ -124,7 +124,7 @@ async def slack_alert(input: SlackAlertInput, response: Response): return e.response -@api.post("/teams-alert", status_code=200) +@app.post("/teams-alert", status_code=200) async def teams_alert(input: TeamsAlertInput, response: Response): """ Send a Teams alert to a channel. diff --git a/containers/alerts/tests/test_alerts.py b/containers/alerts/tests/test_alerts.py index 40c62b80cf..69204b5f14 100644 --- a/containers/alerts/tests/test_alerts.py +++ b/containers/alerts/tests/test_alerts.py @@ -3,9 +3,9 @@ from fastapi.testclient import TestClient from types import SimpleNamespace -from app.main import api +from app.main import app -client = TestClient(api) +client = TestClient(app) def test_health_check(): diff --git a/containers/fhir-converter/Dockerfile b/containers/fhir-converter/Dockerfile index 884fb9d86a..f7aa6488ea 100644 --- a/containers/fhir-converter/Dockerfile +++ b/containers/fhir-converter/Dockerfile @@ -35,4 +35,4 @@ COPY app/main.py . COPY Templates/eCR /build/FHIR-Converter/data/Templates/eCR EXPOSE 8080 -CMD uvicorn main:api --host 0.0.0.0 --port 8080 \ No newline at end of file +CMD uvicorn main:app --host 0.0.0.0 --port 8080 \ No newline at end of file diff --git a/containers/fhir-converter/app/main.py b/containers/fhir-converter/app/main.py index e75c008ef8..0671df75ae 100644 --- a/containers/fhir-converter/app/main.py +++ b/containers/fhir-converter/app/main.py @@ -7,7 +7,7 @@ from pydantic import BaseModel -api = FastAPI() +app = FastAPI() class InputType(str, Enum): @@ -97,12 +97,12 @@ class FhirConverterInput(BaseModel): root_template: RootTemplate -@api.get("/") +@app.get("/") async def health_check(): return {"status": "OK"} -@api.post("/convert-to-fhir", status_code=200) +@app.post("/convert-to-fhir", status_code=200) async def convert(input: FhirConverterInput, response: Response): result = convert_to_fhir(**dict(input)) if "original_request" in result.get("response"): diff --git a/containers/fhir-converter/tests/test_FHIR-Converter.py b/containers/fhir-converter/tests/test_FHIR-Converter.py index 0d881c1242..0a325442ef 100644 --- a/containers/fhir-converter/tests/test_FHIR-Converter.py +++ b/containers/fhir-converter/tests/test_FHIR-Converter.py @@ -2,9 +2,9 @@ from unittest import mock from fastapi.testclient import TestClient import json -from app.main import api +from app.main import app -client = TestClient(api) +client = TestClient(app) valid_request = { "input_data": "VALID_INPUT_DATA", diff --git a/utils/make_openapi_json.py b/utils/make_openapi_json.py new file mode 100644 index 0000000000..25ac0df5ee --- /dev/null +++ b/utils/make_openapi_json.py @@ -0,0 +1,25 @@ +from fastapi.openapi.utils import get_openapi +import json + +try: + from app.main import app +except ModuleNotFoundError: + from main import app +""" +This is a simple script that writes the OpenAPI schema for a FastAPI application to +a JSON file. 
This JSON can then be used with a tool like redoc-cli to generate a static +HTML version of the API documentation served by a FastAPI applications at the /docs +endpoint. +""" + +with open("openapi.json", "w") as f: + json.dump( + get_openapi( + title=app.title, + version=app.version, + openapi_version=app.openapi_version, + description=app.description, + routes=app.routes, + ), + f, + ) From 507723c75271fd8f4ae3a8c11e1385ab5e356a82 Mon Sep 17 00:00:00 2001 From: m-goggins <53578688+m-goggins@users.noreply.github.com> Date: Thu, 4 May 2023 15:08:37 -0700 Subject: [PATCH 05/12] Add Record Linkage SQL Postmortem (#529) * add postmortem template * 1st pass * updates based on 5.4.23 meeting * fix typos and formatting --------- Co-authored-by: emmastephenson --- .../20230504_record_linkage_sql_postmortem.md | 67 +++++++++++++++++++ docs/Postmortems/_postmortem_template.md | 65 ++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 docs/Postmortems/20230504_record_linkage_sql_postmortem.md create mode 100644 docs/Postmortems/_postmortem_template.md diff --git a/docs/Postmortems/20230504_record_linkage_sql_postmortem.md b/docs/Postmortems/20230504_record_linkage_sql_postmortem.md new file mode 100644 index 0000000000..9568d4925a --- /dev/null +++ b/docs/Postmortems/20230504_record_linkage_sql_postmortem.md @@ -0,0 +1,67 @@ +# 2023-05-04 Record Linkage SQL Injection Postmortem + +**Postmortem Owner:** Marcelle, Dan + +**Meeting Scheduled For:** Engineering Sync @ 10:30am PT on 5/4/23 + +## Overview +During an end-to-end run of the DIBBs pipeline with synthetic data, we discovered an error where a FHIR bundle [could not connect](https://skylight-hq.slack.com/archives/C03UF70CKGE/p1682360691930109?thread_ts=1682353411.011679&cid=C03UF70CKGE) to the MPI database for record linkage because the MRN contained an apostrophe, which early-terminated the SQL code used to retrieve blocking data from the MPI database. + +## Contributing Factors +- The queries to the MPI did not include any measures to prevent SQL injection attacks, even unintentional ones such as "Patient's Medical Record Number". +- When switching code contexts, i.e., from Python to SQL, we did not consider ramifications of using a different language and framework, such as externally connecting to PHI. +- The test data we used initially did not include any single quotes and the test data that uncovered this issue included a single quote by accident; we were lucky this was discovered at all! + +## Resolution +We added functionality to sanitize the SQL queries ([#512](https://app.zenhub.com/workspaces/dibbs-63f7aa3e1ecdbb0011edb299/issues/gh/cdcgov/phdi/512)), moving from raw SQL statements like "SELECT * FROM table;" to using [SQL composition](https://realpython.com/prevent-python-sql-injection/#passing-safe-query-parameters) to pass in parameters for the queries as [Literals](https://www.psycopg.org/docs/sql.html#psycopg2.sql.Literal). This took a little extra time because of the specifics of our queries, i.e., jsonb queries necessitate a lot of single quotes. + +Future resolutions/bigger picture items to consider: +- How can we handle switching languages/contexts both in development and code review? +- How can we develop more robust test data to potentially uncover issues like this earlier? 
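
For illustration, the sketch below shows the general shape of the fix, assuming a hypothetical `patient` table with an `mrn` column and the test database credentials configured in `test.yaml`; it is not the actual MPI blocking query (those select from jsonb columns), but it demonstrates how composing the statement with `psycopg2.sql` and binding the MRN as a `Literal` keeps an apostrophe from terminating the statement early:

```python
# Minimal sketch of the sanitization pattern, NOT the actual MPI blocking query
# (the real queries select from jsonb columns). Table/column names are
# placeholders; connection values mirror the test database from test.yaml.
import psycopg2
from psycopg2 import sql

mrn = "Patient's Medical Record Number"  # value containing an apostrophe

conn = psycopg2.connect(
    dbname="testdb", user="postgres", password="pw", host="localhost", port=5432
)

# Unsafe: interpolating the value lets the apostrophe terminate the string early.
# query = f"SELECT patient_resource FROM patient WHERE mrn = '{mrn}';"

# Safer: compose identifiers explicitly and bind the value as a Literal so the
# driver escapes it as data rather than treating it as SQL syntax.
query = sql.SQL("SELECT {col} FROM {table} WHERE {mrn_col} = {mrn};").format(
    col=sql.Identifier("patient_resource"),
    table=sql.Identifier("patient"),
    mrn_col=sql.Identifier("mrn"),
    mrn=sql.Literal(mrn),
)

with conn.cursor() as cursor:
    cursor.execute(query)
    rows = cursor.fetchall()
```

With `sql.Identifier` the table and column names are safely quoted, and `sql.Literal` (or an ordinary `execute` parameter) hands the value to the driver for escaping, so a `'` inside an MRN is treated as data rather than as the end of a string literal.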
+ +## Impact +10+ hours debugging and implementing solution for 1-3 engineers + +## Timeline +**Time (ET)**|**Event** +:-----:|:-----: +03-29-2023|MPI query code implemented +04-23-2023|Problem discovered with end-to-end pipeline testing +04-24-2023|Resolution implemented + + +## How’d We Do? +All the following sections should be filled out together as a team during the postmortem meeting. + +### What Went Well? +- List anything the team did well and want to call out. +- Lots of pairing and collaboration +- Fairly quick turnaround time (2 days) considering the change to the approach + +### Where Did We Get Lucky? +- The test file happened to have an apostrophe; not all patient medical record numbers included them + +### What Didn’t Go So Well? +- List anything that could have gone better. The intent is that we should follow up on all points here to improve our processes. +- Error messages were not robust; the issue wasn't that we couldn't connect to the DB + +### What Did We Learn? +- List any findings that came out of the incident. +- Using as close to real data when testing +- More integration tests + +## Potential Action Items +Explore potential action items grouped by the themes discussed in What Didn’t Go So Well. +- Re-organize assets by what the data _is_, e.g., FHIR bundles that can be re-used for testing across BBs +- Update RL error messages +- Update call to RL endpoint such that there are 3 separate try/except blocks instead of 1 try/except with 3 SDK functions within the same block; more intentional about designing try/except blocks +- Spike: Investigate other DB connection packages, e.g., SQLAlchemy vs. pyscopg +- Adjusting error response to not include the entire bundle (when failing?) so that it is easier to see the error message OR return message before the FHIR bundle so it is easier to see the message contents + + +## Action Items +The action items we are committing to from the potential action Items. Each action item should be in the form of a Zenhub ticket. +- Error handling in RL endpoint +- Re-organizing response to include message earlier on +- Wash your hands and your sql strings (sanitize) + diff --git a/docs/Postmortems/_postmortem_template.md b/docs/Postmortems/_postmortem_template.md new file mode 100644 index 0000000000..95097bf6e8 --- /dev/null +++ b/docs/Postmortems/_postmortem_template.md @@ -0,0 +1,65 @@ +# YYYY-MM-DD Title + +**Postmortem Owner:** Your name goes here. + +**Meeting Scheduled For:** Schedule the meeting on the "Incident Postmortem Meetings" shared calendar, for within 5 business days after the incident. Put the date/time here. + +## Overview +Include a short sentence or two summarizing the contributing factors, timeline summary, and the impact. E.g. "On the morning of August 99th, we suffered a 1 minute SEV-1 due to a runaway process on our primary database machine. This slowness caused roughly 0.024% of alerts that had begun during this time to be delivered out of SLA." + +## Contributing Factors +Include a description of any conditions that contributed to the issue. If there were any actions taken that exacerbated the issue, also include them here with the intention of learning from any mistakes made during the resolution process. + +## Resolution +Include a description of what solved the problem. If there was a temporary fix in place, describe that along with the long-term solution. + +## Impact +Be very specific here and include exact numbers. + +## Timeline +Some important times to include: +1. time the contributing factor began +1. 
time of the page +1. time that the status page was updated (i.e. when the incident became public) +1. time of any significant actions +1. time the SEV-2/1 ended +1. links to tools/logs that show how the timestamp was arrived at. + +**Time (ET)**|**Event** +:-----:|:-----: +12-10-2021 12:31 PM|Description of an important event + + +## How’d We Do? +All the following sections should be filled out together as a team during the postmortem meeting. + +### What Went Well? +- List anything the team did well and want to call out. + +### Where Did We Get Lucky? +- List anything we got lucky on. + +### What Didn’t Go So Well? +- List anything that could have gone better. The intent is that we should follow up on all points here to improve our processes. + +### What Did We Learn? +- List any findings that came out of the incident. + +## Potential Action Items +Explore potential action items grouped by the themes discussed in What Didn’t Go So Well. + +Examples: +1. any fixes required to prevent the contributing factor in the future +2. any preparedness tasks that could help mitigate the problem if it came up again +3. any improvements to our incident response process (pages, alert thresholds, etc). + +## Action Items +The action items we are committing to from the potential action Items. Each action item should be in the form of a Zenhub ticket. + +## Messaging + +### Internal +This is a follow-up for employees. It should be sent out right after the postmortem meeting is over. It only needs a short paragraph summarizing the incident and a link to this wiki page. + +### External +What are we telling customers, including an apology? (The apology should be genuine, not rote.) From 360ad5d87e634f3e184074327fcd0ff516dd7fe4 Mon Sep 17 00:00:00 2001 From: Daniel Paseltiner <99684231+DanPaseltiner@users.noreply.github.com> Date: Thu, 4 May 2023 21:06:13 -0400 Subject: [PATCH 06/12] Add the kafka to delta table streaming service (#359) * Change to psycopg2-binary. * update lock file * Initial commit with working, but imcomplete kafka-to-delta-table service. * Progress on schema handling * Finish adding support for handling schemas. * Update description introduction * progress on schema support * Add test for get_spark_schema. * Install phdi from main branch. * Add test for validate_schema * Test adlsgen2 connector * Expand adls gen2 connector test * rename /test to /tests * Test load_schema * fix typo * Add tests for the /kafka-to-delta-table endpoint * black * black * Migrate to using AzureCredentialManager for getting key vault secrets. * black * Refactor kafka_to_delta.py * Tests and associated refactors for get_arguments(). * black * test set_selection_flags() * test main() and test refactors to keep things DRY. 
* black * flake8 in main.py * flake8 fixes * Fix typo * added passing unit tests for kafka connectors * forgot to run black * separated lines for variable * partial doc update with instructions for running from Python source * added full schema for the data elements from FHIR * Use docker compose for kafka (#389) Allows the use of docker compose for the kafka container --------- Co-authored-by: DanPaseltiner Co-authored-by: Nick Clyde * Update containers/kafka-to-delta-table/Dockerfile remove duplicate -y flag --------- Co-authored-by: robertmitchellv Co-authored-by: Brady Fausett at Skylight Co-authored-by: Nick Co-authored-by: Nick Clyde --- .gitignore | 2 + containers/kafka-to-delta-table/.env | 6 + containers/kafka-to-delta-table/Dockerfile | 21 +- .../kafka-to-delta-table/app/data_to_kafka.py | 125 ++++++++ .../app/default_schemas/ecr.json | 44 ++- .../kafka-to-delta-table/app/get_delta.py | 72 +++++ .../app/kafka_connectors.py | 28 +- .../app/kafka_to_delta.py | 26 +- containers/kafka-to-delta-table/app/main.py | 274 +++++++++++++++++- .../app/storage_connectors.py | 2 +- .../kafka-to-delta-table/docker-compose.yml | 66 +++++ .../kafka-to-delta-table/requirements.txt | 4 +- .../tests/test_data_to_kafka.py | 91 ++++++ .../tests/test_kafka_connectors.py | 3 + .../kafka-to-delta-table/tests/test_main.py | 76 +++++ .../tests/test_storage_connectors.py | 6 +- 16 files changed, 819 insertions(+), 27 deletions(-) create mode 100644 containers/kafka-to-delta-table/.env create mode 100644 containers/kafka-to-delta-table/app/data_to_kafka.py create mode 100644 containers/kafka-to-delta-table/app/get_delta.py create mode 100644 containers/kafka-to-delta-table/docker-compose.yml create mode 100644 containers/kafka-to-delta-table/tests/test_data_to_kafka.py create mode 100644 containers/kafka-to-delta-table/tests/test_main.py diff --git a/.gitignore b/.gitignore index 5c9db89a2c..184d53509f 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,5 @@ data/ # VS Code /.vscode/settings.json + +persistent_storage/ \ No newline at end of file diff --git a/containers/kafka-to-delta-table/.env b/containers/kafka-to-delta-table/.env new file mode 100644 index 0000000000..b498ff928d --- /dev/null +++ b/containers/kafka-to-delta-table/.env @@ -0,0 +1,6 @@ +# x64 users +JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 +# M1 users +# JAVA_HOME=/usr/lib/jvm/java-11-openjdk-arm64 +PYSPARK_PYTHON=/usr/local/bin/python +PYSPARK_DRIVER_PYTHON=/usr/local/bin/python \ No newline at end of file diff --git a/containers/kafka-to-delta-table/Dockerfile b/containers/kafka-to-delta-table/Dockerfile index c77114b112..0abb6abee9 100644 --- a/containers/kafka-to-delta-table/Dockerfile +++ b/containers/kafka-to-delta-table/Dockerfile @@ -4,18 +4,25 @@ WORKDIR /code RUN apt-get update && \ apt-get upgrade -y && \ - apt-get install -y git && \ - apt-get install -y wget + apt-get install -y git wget procps gcc +RUN apt-get install -y gnupg ca-certificates software-properties-common openjdk-11-jdk openjdk-11-jre + +# Add Java binaries to the system path +ENV PATH $JAVA_HOME/bin:$PATH + +RUN pip install --upgrade pip COPY ./requirements.txt /code/requirements.txt RUN pip install -r requirements.txt -RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/3.3.2/hadoop-common-3.3.2.jar -o /usr/local/lib/python3.10/site-packages/pyspark/jars/hadoop-common-3.3.2.jar -RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.2/hadoop-azure-3.3.2.jar -o 
/usr/local/lib/python3.10/site-packages/pyspark/jars/hadoop-azure-3.3.2.jar -RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure-datalake/3.3.2/hadoop-azure-datalake-3.3.2.jar -o /usr/local/lib/python3.10/site-packages/pyspark/jars/hadoop-azure-datalake-3.3.2.jar +RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/3.1.3/hadoop-common-3.1.3.jar -o /usr/local/lib/python3.10/site-packages/pyspark/jars/hadoop-common-3.1.3.jar +RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.1.3/hadoop-azure-3.1.3.jar -o /usr/local/lib/python3.10/site-packages/pyspark/jars/hadoop-azure-3.1.3.jar +RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure-datalake/3.1.3/hadoop-azure-datalake-3.1.3.jar -o /usr/local/lib/python3.10/site-packages/pyspark/jars/hadoop-azure-datalake-3.1.3.jar COPY ./app /code/app +COPY ./app/kafka_to_delta.py /code/kafka_to_delta.py +COPY ./app/data_to_kafka.py /code/data_to_kafka.py +COPY ./app/get_delta.py /code/get_delta.py COPY ./description.md /code/description.md - EXPOSE 8080 -CMD uvicorn app.main:app --host 0.0.0.0 --port 8080 \ No newline at end of file +CMD uvicorn app.main:app --host 0.0.0.0 --port 8080 diff --git a/containers/kafka-to-delta-table/app/data_to_kafka.py b/containers/kafka-to-delta-table/app/data_to_kafka.py new file mode 100644 index 0000000000..2e0a9d12b1 --- /dev/null +++ b/containers/kafka-to-delta-table/app/data_to_kafka.py @@ -0,0 +1,125 @@ +from app.storage_connectors import STORAGE_PROVIDERS +from pyspark.sql import SparkSession +from app.utils import get_spark_schema +import sys +import argparse +import json +from pyspark.sql.functions import to_json, struct +from app.kafka_connectors import KAFKA_PROVIDERS, create_kafka_data_frame + + +def set_selection_flags(arguments: list) -> dict: + """ + Sets the value of the selection_flags dictionary to True if the corresponding + command line argument is present in the list of arguments. + + :param arguments: A list of command line arguments. + :return: A dictionary containing values indicating which Kafka and storage providers + are selected. + """ + selection_flags = {} + providers = KAFKA_PROVIDERS.__args__ + STORAGE_PROVIDERS.__args__ + for flag in providers: + if flag in arguments: + selection_flags[flag] = True + else: + selection_flags[flag] = False + + return selection_flags + + +def get_arguments(arguments: list, selection_flags: dict) -> argparse.Namespace: + """ + Parses command line arguments. + + :param arguments: A list of command line arguments. + :param selection_flags: A dictionary containing values indicating which Kafka and + storage providers are selected. + :return: An argparse.Namespace object containing the parsed arguments. 
+ """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--storage_provider", + choices=["local_storage"], + type=str, + required=True, + help="The type of storage resource that will be written to", + ) + parser.add_argument( + "--kafka_provider", + choices=["local_kafka"], + type=str, + required=True, + help="The type of kafka cluster to read from.", + ) + parser.add_argument( + "--kafka_server", + type=str, + required=selection_flags["local_kafka"], + help="The URL of a Kafka server including port.", + ) + + parser.add_argument( + "--kafka_topic", + type=str, + default="", + required=selection_flags["local_kafka"], + help="The name of a Kafka topic to read from.", + ) + + parser.add_argument( + "--schema", + type=str, + required=True, + help="The schema of the data to be written to the Delta table as a JSON string " + "with the form '{'field1': 'type1', 'field2': 'type2'}'.", + ) + parser.add_argument( + "--data", + type=str, + required=True, + help="The data to be written to the kafka table as a JSON string.", + ) + + return parser.parse_args(arguments) + + +def main(): + """ + Upload data to kafka instance + """ + arguments_list = sys.argv[1:] + selection_flags = set_selection_flags(arguments_list) + arguments = get_arguments(arguments_list, selection_flags) + + spark = ( + SparkSession.builder.master("local[*]") + .appName("kafka-to-delta-table") + .config("spark.sql.debug.maxToStringFields", "100") + .getOrCreate() + ) + spark.sparkContext.setLogLevel("ALL") + + schema = get_spark_schema(arguments.schema) + + if selection_flags["local_kafka"]: + kafka_data_frame = create_kafka_data_frame( + spark, + schema, + data=json.loads(arguments.data), + ) + df_json = kafka_data_frame.select(to_json(struct("*")).alias("value")) + + ( + df_json.write.format("kafka") + .option("kafka.bootstrap.servers", arguments.kafka_server) + .option("topic", arguments.kafka_topic) + .save() + ) + + sys.exit() + + +if __name__ == "__main__": + main() diff --git a/containers/kafka-to-delta-table/app/default_schemas/ecr.json b/containers/kafka-to-delta-table/app/default_schemas/ecr.json index 6404fcc6e7..e6d91ac789 100644 --- a/containers/kafka-to-delta-table/app/default_schemas/ecr.json +++ b/containers/kafka-to-delta-table/app/default_schemas/ecr.json @@ -1,4 +1,44 @@ { - "first_name":"string", - "last_name":"string" + "patient_id": "string", + "person_id": "string", + "last_name": "string", + "first_name": "string", + "rr_id": "string", + "status": "string", + "conditions": "string", + "eicr_id": "string", + "eicr_version_number": "integer", + "authoring_datetime": "timestamp", + "provider_id": "string", + "facility_id_number": "string", + "facility_name": "string", + "facility_type": "string", + "encounter_type": "string", + "encounter_start_date": "date", + "encounter_end_date": "date", + "active_problem_1": "string", + "active_problem_date_1": "date", + "active_problem_2": "string", + "active_problem_date_2": "date", + "active_problem_3": "string", + "active_problem_date_3": "date", + "active_problem_4": "string", + "active_problem_date_4": "date", + "active_problem_5": "string", + "active_problem_date_5": "date", + "reason_for_visit": "string", + "test_type_1": "string", + "test_result_1": "string", + "test_result_interp_1": "string", + "specimen_type_1": "string", + "performing_lab_1": "string", + "specimen_collection_date_1": "timestamp", + "result_date_1": "timestamp", + "test_type_2": "string", + "test_result_2": "string", + "test_result_interp_2": "string", + "specimen_type_2": 
"string", + "performing_lab_2": "string", + "specimen_collection_date_2": "timestamp", + "result_date_2": "timestamp" } \ No newline at end of file diff --git a/containers/kafka-to-delta-table/app/get_delta.py b/containers/kafka-to-delta-table/app/get_delta.py new file mode 100644 index 0000000000..ee019c11e5 --- /dev/null +++ b/containers/kafka-to-delta-table/app/get_delta.py @@ -0,0 +1,72 @@ +from pyspark.sql import SparkSession +import argparse +import sys +from app.kafka_connectors import KAFKA_PROVIDERS +from app.storage_connectors import STORAGE_PROVIDERS + + +def set_selection_flags(arguments: list) -> dict: + """ + Sets the value of the selection_flags dictionary to True if the corresponding + command line argument is present in the list of arguments. + + :param arguments: A list of command line arguments. + :return: A dictionary containing values indicating which Kafka and storage providers + are selected. + """ + selection_flags = {} + providers = KAFKA_PROVIDERS.__args__ + STORAGE_PROVIDERS.__args__ + for flag in providers: + if flag in arguments: + selection_flags[flag] = True + else: + selection_flags[flag] = False + + return selection_flags + + +def get_arguments(arguments: list) -> argparse.Namespace: + """ + Parses command line arguments. + + :param arguments: A list of command line arguments. + :param selection_flags: A dictionary containing values indicating which Kafka and + storage providers are selected. + :return: An argparse.Namespace object containing the parsed arguments. + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--delta_table_name", + type=str, + required=True, + help="The name of the Delta table to write to.", + ) + + return parser.parse_args(arguments) + + +def main(): + """ + Submit a Spark job to read from a Kafka topic and write to a Delta table according + to configuration provided by command line arguments. + """ + arguments_list = sys.argv[1:] + arguments = get_arguments(arguments_list) + + spark = ( + SparkSession.builder.master("local[*]") + .appName("kafka-to-delta-table") + .config("spark.sql.debug.maxToStringFields", "100") + .getOrCreate() + ) + spark.sparkContext.setLogLevel("WARN") + base_path = "./persistent_storage/kafka/" + print("**ParquetTable**") + df = spark.read.parquet(base_path + arguments.delta_table_name) + df.show(10) + sys.exit() + + +if __name__ == "__main__": + main() diff --git a/containers/kafka-to-delta-table/app/kafka_connectors.py b/containers/kafka-to-delta-table/app/kafka_connectors.py index 83693794f7..c785a1a9bb 100644 --- a/containers/kafka-to-delta-table/app/kafka_connectors.py +++ b/containers/kafka-to-delta-table/app/kafka_connectors.py @@ -3,8 +3,10 @@ from pyspark.sql import SparkSession, DataFrame from phdi.cloud.azure import AzureCredentialManager from typing import Literal +import os KAFKA_PROVIDERS = Literal["local_kafka", "azure_event_hubs"] +KAFKA_WRITE_DATA_PROVIDERS = Literal["local_kafka"] def connect_to_azure_event_hubs( @@ -59,7 +61,11 @@ def connect_to_azure_event_hubs( def connect_to_local_kafka( - spark: SparkSession, schema: StructType, kafka_server: str, kafka_topic: str + spark: SparkSession, + schema: StructType, + kafka_server: str, + kafka_topic: str, + checkpoint_path: str, ) -> DataFrame: """ Given a SparkSession object and a schema (StructType) read JSON data from a Kafka @@ -70,14 +76,34 @@ def connect_to_local_kafka( :param kafka_server: The URL of a Kafka server including port. :param kafka_topic: The name of a Kafka topic. 
""" + + offsets = "latest" if os.path.exists(checkpoint_path) else "earliest" kafka_data_frame = ( spark.readStream.format("kafka") .option("kafka.bootstrap.servers", kafka_server) .option("failOnDataLoss", "false") .option("subscribe", kafka_topic) .option("includeHeaders", "true") + .option("startingOffsets", offsets) .load() .select(from_json(col("value").cast("string"), schema).alias("parsed_value")) .select(col("parsed_value.*")) ) return kafka_data_frame + + +def create_kafka_data_frame( + spark: SparkSession, + schema: StructType, + data: list[dict], +) -> DataFrame: + """ + Given a SparkSession object and a schema (StructType) return a dataframe for writing + data. + + :param spark: A SparkSession object to use for streaming data from Kafka. + :param schema: A schema describing the JSON values read from the topic. + :param data: A list of the data to be written + """ + kafka_data_frame = spark.createDataFrame(data, schema) + return kafka_data_frame diff --git a/containers/kafka-to-delta-table/app/kafka_to_delta.py b/containers/kafka-to-delta-table/app/kafka_to_delta.py index 2215e927ca..54a4ebf20b 100644 --- a/containers/kafka-to-delta-table/app/kafka_to_delta.py +++ b/containers/kafka-to-delta-table/app/kafka_to_delta.py @@ -162,6 +162,11 @@ def main(): SparkSession.builder.master("local[*]") .appName("kafka-to-delta-table") .config("spark.sql.debug.maxToStringFields", "100") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) .getOrCreate() ) spark.sparkContext.setLogLevel("WARN") @@ -179,6 +184,14 @@ def main(): ) schema = get_spark_schema(arguments.schema) + + delta_table_path = ( + base_path + f"{kafka_topic_mappings[arguments.kafka_provider]}-table" + ) + checkpoint_path = ( + base_path + f"{kafka_topic_mappings[arguments.kafka_provider]}-checkpoint" + ) + if selection_flags["azure_event_hubs"]: kafka_data_frame = connect_to_azure_event_hubs( spark, @@ -191,16 +204,13 @@ def main(): elif selection_flags["local_kafka"]: kafka_data_frame = connect_to_local_kafka( - spark, schema, arguments.kafka_server, arguments.kafka_topic + spark, + schema, + arguments.kafka_server, + arguments.kafka_topic, + checkpoint_path, ) - delta_table_path = ( - base_path + f"{kafka_topic_mappings[arguments.kafka_provider]}-table" - ) - checkpoint_path = ( - base_path + f"{kafka_topic_mappings[arguments.kafka_provider]}-checkpoint" - ) - query = ( kafka_data_frame.writeStream.option("checkpointLocation", checkpoint_path) .outputMode("append") diff --git a/containers/kafka-to-delta-table/app/main.py b/containers/kafka-to-delta-table/app/main.py index 91a96cc29b..98899e33f6 100644 --- a/containers/kafka-to-delta-table/app/main.py +++ b/containers/kafka-to-delta-table/app/main.py @@ -1,11 +1,13 @@ from phdi.containers.base_service import BaseService from pathlib import Path +import json from typing import Literal from pydantic import BaseModel, Field, root_validator -from fastapi import Response -from app.kafka_connectors import KAFKA_PROVIDERS +import subprocess +from fastapi import Response, status +from app.kafka_connectors import KAFKA_PROVIDERS, KAFKA_WRITE_DATA_PROVIDERS from app.storage_connectors import STORAGE_PROVIDERS -from app.utils import SCHEMA_TYPE_MAP +from app.utils import validate_schema, SCHEMA_TYPE_MAP, load_schema # A map of the required values for all supported kafka and storage providers. 
REQUIRED_VALUES_MAP = { @@ -32,6 +34,12 @@ } +class DeltaTableInput(BaseModel): + delta_table_name: str = Field( + description="The name of the Delta table to read from." + ) + + class KafkaToDeltaTableInput(BaseModel): """ The model for requests to the /kafka-to-delta-table endpoint. @@ -164,6 +172,86 @@ def require_schema_or_schema_name(cls, values): return values +class DataToKafkaInput(BaseModel): + """ + The model for requests to the /kafka-to-delta-table endpoint. + """ + + kafka_provider: KAFKA_WRITE_DATA_PROVIDERS = Field( + description="The type of kafka cluster to read from. Only local_kafka is " + "supported for writing data at this time." + ) + kafka_data: list = Field(description="Data to be uploaded to kafka") + storage_provider: STORAGE_PROVIDERS = Field( + description="The type of storage to write to." + ) + json_schema: dict = Field( + description=f"A schema describing the format of messages to read from Kafka. " + "Should be of the form { 'field_name': 'field_type' }. Field names must be " + "strings and supported field types include: " + f"{', '.join(list(SCHEMA_TYPE_MAP.keys()))}. If this is provided, then. " + "'schema_name' must be empty.", + default={}, + alias="schema", + ) + schema_name: str = Field( + description="The name of a schema that was previously uploaded to the service" + " describing the format of messages to read from Kafka. If this is provided" + " then 'schema' must be empty.", + default="", + ) + kafka_server: str = Field( + description="The URL of a Kafka server including port. Required when " + "'kafka_provider' is 'local'.", + default="", + ) + kafka_topic: str = Field( + description="The name of a Kafka topic to read from. Required when " + "'kafka_provider' is 'local'.", + default="", + ) + + @root_validator + def check_for_required_values(cls, values): + """ + For a given set of values, check that all required values are present based on + the values of the kafka_provider and storage_provider fields. + """ + missing_values = [] + + for provider_type in REQUIRED_VALUES_MAP: + provider = values.get(provider_type) + required_values = REQUIRED_VALUES_MAP.get(provider_type).get(provider) + for value in required_values: + if values.get(value) == "": + missing_values.append(value) + + if len(missing_values) > 0: + raise ValueError( + f"When the {provider_type} is '{provider}' then the following " + f"values must be specified: {', '.join(missing_values)}" + ) + return values + + @root_validator + def prohibit_schema_and_schema_name(cls, values): + if values.get("json_schema") != {} and values.get("schema_name") != "": + raise ValueError( + "Values for both 'schema' and 'schema_name' have been " + "provided. Only one of these values is permitted." + ) + return values + + @root_validator + def require_schema_or_schema_name(cls, values): + if values.get("json_schema") == {} and values.get("schema_name") == "": + raise ValueError( + "Values for neither 'schema' nor 'schema_name' have been " + "provided. One, but not both, of these values is required." + ) + return values + + class KafkaToDeltaTableOutput(BaseModel): """ The model for responses from the /kafka-to-delta-table endpoint. @@ -184,7 +272,7 @@ async def kafka_to_delta_table( input: KafkaToDeltaTableInput, response: Response ) -> KafkaToDeltaTableOutput: """ - Stream JSON data from Kafka to a Delta table. + Stream JSON data to kafka storage. Currently local kafka only. :param input: A JSON formatted request body with schema specified by the KafkaToDeltaTableInput model. 
@@ -193,8 +281,184 @@ async def kafka_to_delta_table( """ response_body = { "status": "success", - "message": "This is a stubbed response. This service is not yet implemented.", + "message": "", "spark_log": "", } + if input.schema_name != "": + schema = load_schema(input.schema_name) + else: + schema = input.json_schema + + schema_validation_results = validate_schema(schema) + + if not schema_validation_results["valid"]: + response_body["status"] = "failed" + response_body["message"] = schema_validation_results["errors"][0] + response.status_code = status.HTTP_400_BAD_REQUEST + return response_body + + package_list = [ + "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2", + "io.delta:delta-core_2.12:2.3.0", + "org.apache.kafka:kafka-clients:3.3.2", + "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1", + ] + + kafka_to_delta_command = [ + "spark-submit", + "--packages", + ",".join(package_list), + "kafka_to_delta.py", + "--kafka_provider", + input.kafka_provider, + "--storage_provider", + input.storage_provider, + "--delta_table_name", + input.delta_table_name, + "--schema", + f"'{json.dumps(schema)}'", + ] + + input = input.dict() + for provider_type in REQUIRED_VALUES_MAP: + provider = input[provider_type] + required_values = REQUIRED_VALUES_MAP.get(provider_type).get(provider) + for value in required_values: + kafka_to_delta_command.append(f"--{value}") + kafka_to_delta_command.append(input[value]) + + kafka_to_delta_command = " ".join(kafka_to_delta_command) + kafka_to_delta_result = subprocess.run( + kafka_to_delta_command, + shell=True, + capture_output=True, + text=True, + cwd=str(Path(__file__).parent.parent), + ) + + response_body["spark_log"] = kafka_to_delta_result.stdout + if kafka_to_delta_result.returncode != 0: + response_body["status"] = "failed" + response_body["spark_log"] = kafka_to_delta_result.stderr + return response_body + + +@app.post("/load-data-to-kafka", status_code=200) +async def data_to_kafka( + input: DataToKafkaInput, response: Response +) -> KafkaToDeltaTableOutput: + response_body = { + "status": "success", + "message": "", + "spark_log": "", + } + + if input.schema_name != "": + schema = load_schema(input.schema_name) + else: + schema = input.json_schema + + schema_validation_results = validate_schema(schema) + + if not schema_validation_results["valid"]: + response_body["status"] = "failed" + response_body["message"] = schema_validation_results["errors"][0] + response.status_code = status.HTTP_400_BAD_REQUEST + return response_body + + package_list = [ + "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2", + "io.delta:delta-core_2.12:2.3.0", + "org.apache.kafka:kafka-clients:3.3.2", + ] + + data_to_kafka_command = [ + "spark-submit", + "--packages", + ",".join(package_list), + "data_to_kafka.py", + "--kafka_provider", + input.kafka_provider, + "--storage_provider", + input.storage_provider, + "--schema", + f"'{json.dumps(schema)}'", + "--data", + f"'{json.dumps(input.kafka_data)}'", + ] + input = input.dict() + for provider_type in REQUIRED_VALUES_MAP: + provider = input[provider_type] + required_values = REQUIRED_VALUES_MAP.get(provider_type).get(provider) + for value in required_values: + data_to_kafka_command.append(f"--{value}") + data_to_kafka_command.append(input[value]) + data_to_kafka_command = " ".join(data_to_kafka_command) + data_to_kafka_result = subprocess.run( + data_to_kafka_command, + shell=True, + capture_output=True, + text=True, + cwd=str(Path(__file__).parent.parent), + ) + + response_body["spark_log"] = data_to_kafka_result.stdout 
+ if data_to_kafka_result.returncode != 0: + response_body["status"] = "failed" + response_body["spark_log"] = data_to_kafka_result.stderr + return response_body + + +@app.post("/delta-table", status_code=200) +async def get_delta_table( + input: DeltaTableInput, response: Response +) -> KafkaToDeltaTableOutput: + """ + Read parquet table. + + :param input: A JSON formatted request body with schema specified by the + DeltaTableInput model. + :return: A JSON formatted response body with schema specified by the + KafkaToDeltaTableOutput model. + """ + response_body = { + "status": "success", + "message": "", + "spark_log": "", + } + + package_list = [ + "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2", + "io.delta:delta-core_2.12:2.3.0", + "org.apache.kafka:kafka-clients:3.3.2", + "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1", + ] + + get_delta_command = [ + "spark-submit", + "--packages", + ",".join(package_list), + "get_delta.py", + "--delta_table_name", + input.delta_table_name, + ] + + input = input.dict() + + get_delta_command = " ".join(get_delta_command) + get_delta_result = subprocess.run( + get_delta_command, + shell=True, + capture_output=True, + text=True, + cwd=str(Path(__file__).parent.parent), + ) + + table = get_delta_result.stdout.split("**ParquetTable**") + response_body["spark_log"] = get_delta_result.stdout + response_body["message"] = table[1] if table[1] else "" + if get_delta_result.returncode != 0: + response_body["status"] = "failed" + response_body["spark_log"] = get_delta_result.stderr return response_body diff --git a/containers/kafka-to-delta-table/app/storage_connectors.py b/containers/kafka-to-delta-table/app/storage_connectors.py index 8585475fb4..176ae8975d 100644 --- a/containers/kafka-to-delta-table/app/storage_connectors.py +++ b/containers/kafka-to-delta-table/app/storage_connectors.py @@ -14,7 +14,7 @@ def connect_to_adlsgen2( client_id: str, client_secret_name: str, key_vault_name: str, -) -> (SparkSession, str): +) -> tuple[SparkSession, str]: """ Add required configuration to a SparkSession object to allow it to connect to Azure Data Lake gen 2 (ADLS gen2) storage. 
Connection to ADLS gen2 requires an Azure App diff --git a/containers/kafka-to-delta-table/docker-compose.yml b/containers/kafka-to-delta-table/docker-compose.yml new file mode 100644 index 0000000000..165301ab58 --- /dev/null +++ b/containers/kafka-to-delta-table/docker-compose.yml @@ -0,0 +1,66 @@ +version: "3.9" +name: "container" + +networks: + app-tier: + name: "app-tier" + driver: bridge + +services: + zookeeper: + image: docker.io/bitnami/zookeeper:3.8 + ports: + - "2181:2181" + volumes: + - "zookeeper_data:/bitnami" + environment: + - ALLOW_ANONYMOUS_LOGIN=yes + networks: + - app-tier + kafka: + image: docker.io/bitnami/kafka:3.4 + ports: + - "9092:9092" + volumes: + - "kafka_data:/bitnami" + environment: + - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 + - ALLOW_PLAINTEXT_LISTENER=yes + depends_on: + - zookeeper + networks: + - app-tier + init-kafka: + image: confluentinc/cp-kafka:6.1.1 + depends_on: + - kafka + entrypoint: [ '/bin/sh', '-c' ] + command: | + " + # blocks until kafka is reachable + kafka-topics --bootstrap-server kafka:9092 --list + + echo -e 'Creating kafka topics' + kafka-topics --bootstrap-server kafka:9092 --create --if-not-exists --topic test --replication-factor 1 --partitions 1 + kafka-console-producer --bootstrap-server kafka:9092 --topic test + echo -e 'Successfully created the following topics:' + kafka-topics --bootstrap-server kafka:9092 --list + " + networks: + - app-tier + kafka-to-delta-table: + build: . + env_file: + - .env + ports: + - "8080:8080" + depends_on: + - kafka + - init-kafka + networks: + - app-tier +volumes: + zookeeper_data: + driver: local + kafka_data: + driver: local \ No newline at end of file diff --git a/containers/kafka-to-delta-table/requirements.txt b/containers/kafka-to-delta-table/requirements.txt index 0d0367bb3c..497b5910b7 100644 --- a/containers/kafka-to-delta-table/requirements.txt +++ b/containers/kafka-to-delta-table/requirements.txt @@ -4,4 +4,6 @@ argparse pytest uvicorn httpx -phdi @ git+https://github.com/CDCgov/phdi.git@main \ No newline at end of file +phdi @ git+https://github.com/CDCgov/phdi.git@main +py4j==0.10.9.5 +requests diff --git a/containers/kafka-to-delta-table/tests/test_data_to_kafka.py b/containers/kafka-to-delta-table/tests/test_data_to_kafka.py new file mode 100644 index 0000000000..32c51b169a --- /dev/null +++ b/containers/kafka-to-delta-table/tests/test_data_to_kafka.py @@ -0,0 +1,91 @@ +from app.data_to_kafka import get_arguments, main, set_selection_flags +import argparse +from unittest import mock +import copy +import json + + +def test_set_selection_flags(): + arguments = [ + "--kafka_provider", + "local_kafka", + "--storage_provider", + "local_storage", + ] + selection_flags = set_selection_flags(arguments) + assert selection_flags["local_kafka"] is True + assert selection_flags["azure_event_hubs"] is False + assert selection_flags["local_storage"] is True + assert selection_flags["adlsgen2"] is False + + +def _check_arguments(arguments_list: list, parsed_arguments: argparse.Namespace): + """ + Helper function to confirm that arguments passed to the 'get_arguments()' function + are parsed correctly. 
+ """ + + for argument_index in range(0, len(arguments_list), 2): + argument = arguments_list[argument_index][2:] + value = arguments_list[argument_index + 1] + assert parsed_arguments.__getattribute__(argument) == value + + +def test_get_arguments_missing_arguments(): + arguments = [] + selection_flags = set_selection_flags(arguments) + try: + get_arguments(arguments, selection_flags) + exit_code = 0 + except SystemExit as error: + exit_code = error.code + + assert exit_code == 2 + + +schema = json.dumps({"id": "integer", "name": "string"}) +data = json.dumps({"id": "1", "name": "foo"}) +LOCAL_KAFKA_LOCAL_STORAGE_ARGUMENTS = [ + "--kafka_provider", + "local_kafka", + "--storage_provider", + "local_storage", + "--kafka_server", + "localhost:9092", + "--kafka_topic", + "test-topic", + "--schema", + schema, + "--data", + data, +] + + +def test_get_arguments_local_kafka_local_storage(): + arguments_list = copy.deepcopy(LOCAL_KAFKA_LOCAL_STORAGE_ARGUMENTS) + selection_flags = set_selection_flags(arguments_list) + parsed_arguments = get_arguments(arguments_list, selection_flags) + _check_arguments(arguments_list, parsed_arguments) + + +@mock.patch("app.data_to_kafka.create_kafka_data_frame") +@mock.patch("app.data_to_kafka.struct") +@mock.patch("app.data_to_kafka.to_json") +@mock.patch("app.data_to_kafka.SparkSession") +@mock.patch("app.data_to_kafka.sys") +def test_main_local_kafka_local_storage( + patched_sys, + patched_spark_session, + patched_to_json, + patched_struct, + patched_create_kafka_data_frame, +): + arguments_list = copy.deepcopy(LOCAL_KAFKA_LOCAL_STORAGE_ARGUMENTS) + arguments_list = ["data_to_kafka.py"] + arguments_list + patched_sys.argv = arguments_list + main() + patched_create_kafka_data_frame.assert_called_once() + patched_struct.assert_called_once() + patched_to_json.assert_called_once() + patched_spark_session.builder.master.assert_called_once() + patched_sys.exit.assert_called_once() diff --git a/containers/kafka-to-delta-table/tests/test_kafka_connectors.py b/containers/kafka-to-delta-table/tests/test_kafka_connectors.py index bfc34eea23..45339cf331 100644 --- a/containers/kafka-to-delta-table/tests/test_kafka_connectors.py +++ b/containers/kafka-to-delta-table/tests/test_kafka_connectors.py @@ -84,6 +84,7 @@ def test_connect_to_local_kafka(patched_from_json, patched_col): # setup values for the function parameters kafka_server = "some-kafka-server" kafka_topic = "some-kafka-topic" + checkpoint_path = "./some/path/" # call the function with the mock objects result_kafka_data_frame = connect_to_local_kafka( @@ -91,6 +92,7 @@ def test_connect_to_local_kafka(patched_from_json, patched_col): schema=schema, kafka_server=kafka_server, kafka_topic=kafka_topic, + checkpoint_path=checkpoint_path, ) # create the expected result @@ -100,6 +102,7 @@ def test_connect_to_local_kafka(patched_from_json, patched_col): .option("failOnDataLoss", "false") .option("subscribe", kafka_topic) .option("includeHeaders", "true") + .option("startingOffsets", "earliest") .load() .select( patched_from_json(patched_col("value").cast("string"), schema).alias( diff --git a/containers/kafka-to-delta-table/tests/test_main.py b/containers/kafka-to-delta-table/tests/test_main.py new file mode 100644 index 0000000000..5575b58ab7 --- /dev/null +++ b/containers/kafka-to-delta-table/tests/test_main.py @@ -0,0 +1,76 @@ +from app.main import app +from unittest import mock +from fastapi.testclient import TestClient + +client = TestClient(app) + + +def test_kafka_to_delta_invalid_schema(): + request_body = { + 
"kafka_provider": "local_kafka", + "storage_provider": "local_storage", + "delta_table_name": "test_table", + "schema": {"first_name": "string", "last_name": "unknown_type"}, + "kafka_server": "some-server", + "kafka_topic": "some-topic", + } + + response = client.post("/kafka-to-delta-table", json=request_body) + assert response.status_code == 400 + assert response.json() == { + "status": "failed", + "message": "Invalid type for field last_name: unknown_type. Valid types are " + "['string', 'integer', 'float', 'boolean', 'date', 'timestamp'].", + "spark_log": "", + } + + +@mock.patch("app.main.subprocess") +def test_kafka_to_delta_spark_failure(patched_subprocess): + request_body = { + "kafka_provider": "local_kafka", + "storage_provider": "local_storage", + "delta_table_name": "test_table", + "schema_name": "test_schema.json", + "kafka_server": "some-server", + "kafka_topic": "some-topic", + } + + kafka_to_delta_result = mock.Mock() + kafka_to_delta_result.returncode = 1 + kafka_to_delta_result.stdout = "Spark output" + kafka_to_delta_result.stderr = "Spark error" + patched_subprocess.run.return_value = kafka_to_delta_result + + response = client.post("/kafka-to-delta-table", json=request_body) + assert response.status_code == 200 + assert response.json() == { + "status": "failed", + "message": "", + "spark_log": "Spark error", + } + + +@mock.patch("app.main.subprocess") +def test_kafka_to_delta_spark_success(patched_subprocess): + request_body = { + "kafka_provider": "local_kafka", + "storage_provider": "local_storage", + "delta_table_name": "test_table", + "schema_name": "test_schema.json", + "kafka_server": "some-server", + "kafka_topic": "some-topic", + } + + kafka_to_delta_result = mock.Mock() + kafka_to_delta_result.returncode = 0 + kafka_to_delta_result.stdout = "Spark output" + patched_subprocess.run.return_value = kafka_to_delta_result + + response = client.post("/kafka-to-delta-table", json=request_body) + assert response.status_code == 200 + assert response.json() == { + "status": "success", + "message": "", + "spark_log": "Spark output", + } diff --git a/containers/kafka-to-delta-table/tests/test_storage_connectors.py b/containers/kafka-to-delta-table/tests/test_storage_connectors.py index 9f6423cd41..ebef0dde1e 100644 --- a/containers/kafka-to-delta-table/tests/test_storage_connectors.py +++ b/containers/kafka-to-delta-table/tests/test_storage_connectors.py @@ -37,7 +37,8 @@ def test_connect_to_adlsgen2(patched_cred_manager_class): "OAuth", ), mock.call( - f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net", + f"fs.azure.account.oauth.provider.type.{storage_account}" + + ".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider", ), mock.call( @@ -49,7 +50,8 @@ def test_connect_to_adlsgen2(patched_cred_manager_class): cred_manager.get_secret(), ), mock.call( - f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net", + f"fs.azure.account.oauth2.client.endpoint.{storage_account}" + + ".dfs.core.windows.net", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token", ), mock.call("fs.azure.createRemoteFileSystemDuringInitialization", "false"), From 2ef7206e27a36955966c464db70e88b47c79b7ad Mon Sep 17 00:00:00 2001 From: Nick Clyde Date: Fri, 5 May 2023 10:52:08 -0700 Subject: [PATCH 07/12] Trigger release workflow on special keyword (#535) * Trigger release workflow on special keyword * Tag only, no commit --- .github/workflows/createNewRelease.yaml | 88 ++++++++++++++++--------- 1 file 
changed, 56 insertions(+), 32 deletions(-) diff --git a/.github/workflows/createNewRelease.yaml b/.github/workflows/createNewRelease.yaml index 4a266b4559..bb66d32e92 100644 --- a/.github/workflows/createNewRelease.yaml +++ b/.github/workflows/createNewRelease.yaml @@ -1,23 +1,39 @@ name: Create New Release on: - workflow_dispatch: - inputs: - custom_tag: - description: "Enter version number for release tag below. Don't forget the v! Example: v2.23.9" - type: string - required: true + push: + branches: + - main # Run all tests before making a release jobs: - list-containers: - uses: ./.github/workflows/listContainers.yaml + # Only make a release if commit contains [RELEASE] + check-commit-message: + runs-on: ubuntu-latest + outputs: + contains_release: ${{ steps.commit_check.outputs.contains_release }} + steps: + - name: Check out code + uses: actions/checkout@v3 + + - name: Check commit message for [RELEASE] + id: commit_check + run: | + if [[ "${{ github.event.head_commit.message }}" == *"[RELEASE]"* ]]; then + echo "contains_release=true" >> $GITHUB_OUTPUT + else + echo "contains_release=false" >> $GITHUB_OUTPUT + fi test-for-release: + needs: check-commit-message + if: ${{ needs.check-commit-message.outputs.contains_release == 'true' }} uses: ./.github/workflows/test.yaml - update-version-tag-release: + tag-release: name: Update phdi init version number needs: test-for-release permissions: contents: write + outputs: + version: ${{ steps.get_version.outputs.version }} runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 @@ -28,15 +44,15 @@ jobs: run: | pip install poetry # update the version number in the phdi/__init.py__ file - - name: Update PHDI Version + - name: Get PHDI Version + id: get_version run: | - poetry version ${{ github.event.inputs.custom_tag }} - # Create new release tag based upon user input - - uses: EndBug/add-and-commit@v9 + echo "version=$(poetry version)" >> $GITHUB_OUTPUT + # Create new release tag + - name: Tag Release + uses: EndBug/latest-tag@latest with: - message: Create release ${{ github.event.inputs.custom_tag }} - add: pyproject.toml - tag: ${{ github.event.inputs.custom_tag }} + ref: ${{ steps.get_version.outputs.version }} # Create new release based upon the latest created tag - name: Create Release id: create_release @@ -44,23 +60,25 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: - tag_name: ${{ github.event.inputs.custom_tag }} - release_name: Release ${{ github.event.inputs.custom_tag }} + tag_name: ${{ steps.get_version.outputs.version }} + release_name: Release ${{ steps.get_version.outputs.version }} # Rebuild all containers for the new release build-containers-for-release: - needs: update-version-tag-release + needs: tag-release permissions: contents: read packages: write id-token: write uses: ./.github/workflows/buildReleaseContainers.yaml with: - container-tag: ${{ github.event.inputs.custom_tag }} + container-tag: ${{ needs.tag-release.outputs.version }} # Create updated PHDI docs for the latest release generate-and-update-phdi-docs: - needs: build-containers-for-release + needs: + - tag-release + - build-containers-for-release permissions: contents: write runs-on: ubuntu-latest @@ -68,7 +86,7 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: '0' - ref: ${{ github.event.inputs.custom_tag }} + ref: ${{ needs.tag-release.outputs.version }} - name: Install poetry and dependencies run: | @@ -77,16 +95,21 @@ jobs: - name: Generate docs and move to docs branch run: | - poetry run pdoc ./phdi -o ./docs/${{ 
github.event.inputs.custom_tag }}/sdk + poetry run pdoc ./phdi -o ./docs/${{ needs.tag-release.outputs.version }}/sdk - uses: actions/upload-artifact@v3 with: name: phdi-docs - path: ./docs/${{ github.event.inputs.custom_tag }}/sdk + path: ./docs/${{ needs.tag-release.outputs.version }}/sdk # Create updated container docs for the latest release + list-containers: + needs: check-commit-message + if: ${{ needs.check-commit-message.outputs.contains_release == 'true' }} + uses: ./.github/workflows/listContainers.yaml generate-and-update-container-docs: needs: + - tag-release - list-containers - generate-and-update-phdi-docs permissions: @@ -99,7 +122,7 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: '0' - ref: ${{ github.event.inputs.custom_tag }} + ref: ${{ needs.tag-release.outputs.version }} - name: Update Container Documenation run: | @@ -109,15 +132,16 @@ jobs: cp $GITHUB_WORKSPACE/utils/make_openapi_json.py . pip install -r requirements.txt python make_openapi_json.py - redoc-cli build -o $GITHUB_WORKSPACE/docs/${{ github.event.inputs.custom_tag }}/containers/$CONTAINER.html openapi.json + redoc-cli build -o $GITHUB_WORKSPACE/docs/${{ needs.tag-release.outputs.version }}/containers/$CONTAINER.html openapi.json - uses: actions/upload-artifact@v3 with: name: container-docs - path: ./docs/${{ github.event.inputs.custom_tag }}/containers + path: ./docs/${{ needs.tag-release.outputs.version }}/containers commit-docs: needs: + - tag-release - generate-and-update-phdi-docs - generate-and-update-container-docs permissions: @@ -132,24 +156,24 @@ jobs: uses: actions/download-artifact@v2 with: name: phdi-docs - path: ./docs/${{ github.event.inputs.custom_tag }}/sdk + path: ./docs/${{ needs.tag-release.outputs.version }}/sdk - name: Download container docs from artifacts uses: actions/download-artifact@v2 with: name: container-docs - path: ./docs/${{ github.event.inputs.custom_tag }}/containers + path: ./docs/${{ needs.tag-release.outputs.version }}/containers - name: Copy to latest folder run: | rm -rf ./docs/latest mkdir -p ./docs/latest/sdk mkdir -p ./docs/latest/containers - cp -r ./docs/${{ github.event.inputs.custom_tag }}/sdk/* ./docs/latest/sdk - cp -r ./docs/${{ github.event.inputs.custom_tag }}/containers/* ./docs/latest/containers + cp -r ./docs/${{ needs.tag-release.outputs.version }}/sdk/* ./docs/latest/sdk + cp -r ./docs/${{ needs.tag-release.outputs.version }}/containers/* ./docs/latest/containers - name: Commit New Documentation uses: EndBug/add-and-commit@v9 with: add: docs - message: Automated update of docs for ${{ github.event.inputs.custom_tag }} release. + message: Automated update of docs for ${{ needs.tag-release.outputs.version }} release. 
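For clarity, the check-commit-message gate introduced above reduces to a substring test on the head commit message. A minimal Python sketch of that logic follows; it is illustrative only, since the workflow itself performs the check in shell and writes the result to GITHUB_OUTPUT:

```python
# Illustrative sketch of the [RELEASE] gate added in createNewRelease.yaml.
# The real check runs in shell inside the workflow; this is not project code.
def contains_release(head_commit_message: str) -> bool:
    """Return True when a push to main should trigger a release."""
    return "[RELEASE]" in head_commit_message


assert contains_release("[RELEASE] v0.1.0.dev5 (#536)")
assert not contains_release("Trigger release workflow on special keyword (#535)")
```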
From 2619ee207fd151223d0418c7d4177e6ebedf5898 Mon Sep 17 00:00:00 2001 From: KennethSkylight <107214304+KennethSkylight@users.noreply.github.com> Date: Sat, 6 May 2023 03:33:03 +0800 Subject: [PATCH 08/12] Add step to publish to PyPI to release workflow (#515) * add github workflow * remove testpypi * changed version * adding back in test * changing version * Adding tagging back in * removing * removing test pypi * Move PyPI release job to createNewRelease.yaml * Fix pyproject.toml * Add permissions --------- Co-authored-by: Nick Clyde --- .github/workflows/createNewRelease.yaml | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/.github/workflows/createNewRelease.yaml b/.github/workflows/createNewRelease.yaml index bb66d32e92..bbc8b7bd52 100644 --- a/.github/workflows/createNewRelease.yaml +++ b/.github/workflows/createNewRelease.yaml @@ -62,6 +62,40 @@ jobs: with: tag_name: ${{ steps.get_version.outputs.version }} release_name: Release ${{ steps.get_version.outputs.version }} + + release-to-pypi: + name: Build and publish PHDI to PyPI + needs: tag-release + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: '0' + ref: ${{ needs.tag-release.outputs.version }} + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: >- + python -m + build + --sdist + --wheel + --outdir dist/ + . + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} # Rebuild all containers for the new release build-containers-for-release: From a12327bf9a4cb9bd049c7ff14df6f6aa218535cf Mon Sep 17 00:00:00 2001 From: Nick Clyde Date: Fri, 5 May 2023 12:38:28 -0700 Subject: [PATCH 09/12] [RELEASE] v0.1.0.dev5 (#536) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 988c38ce16..4fc0f448d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "phdi" -version = "v0.1.0.dev1" +version = "v0.1.0.dev5" description = "Public health data infrastructure Building Blocks is a library to help public health departments work with their data" authors = ["Kenneth Chow ", "Brandon Mader ", "Spencer Kathol "] homepage = "https://github.com/CDCgov/phdi" From 9a2739eb9bc3ab369df5e9580f9b08cf603aff6b Mon Sep 17 00:00:00 2001 From: Nick Clyde Date: Fri, 5 May 2023 13:01:30 -0700 Subject: [PATCH 10/12] [RELEASE] Fix release tagging (#537) --- .github/workflows/createNewRelease.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/createNewRelease.yaml b/.github/workflows/createNewRelease.yaml index bbc8b7bd52..99c74dbe22 100644 --- a/.github/workflows/createNewRelease.yaml +++ b/.github/workflows/createNewRelease.yaml @@ -49,10 +49,14 @@ jobs: run: | echo "version=$(poetry version)" >> $GITHUB_OUTPUT # Create new release tag + - name: Set up Git user + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" - name: Tag Release - uses: EndBug/latest-tag@latest - with: - ref: ${{ steps.get_version.outputs.version }} + run: | + git tag ${{ steps.get_version.outputs.version }} + git push origin ${{ steps.get_version.outputs.version }} # Create new release based upon the latest created tag - name: Create Release id: 
create_release From 4fd35acfb39d644344aa955a3f0194a065a0764f Mon Sep 17 00:00:00 2001 From: Nick Clyde Date: Fri, 5 May 2023 13:17:20 -0700 Subject: [PATCH 11/12] [RELEASE] Fix tagging for realz (#538) * [RELEASE] Fix release tagging * [RELEASE] Fix tagging for realz * wut --- .github/workflows/createNewRelease.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/createNewRelease.yaml b/.github/workflows/createNewRelease.yaml index 99c74dbe22..fd7af52105 100644 --- a/.github/workflows/createNewRelease.yaml +++ b/.github/workflows/createNewRelease.yaml @@ -47,7 +47,8 @@ jobs: - name: Get PHDI Version id: get_version run: | - echo "version=$(poetry version)" >> $GITHUB_OUTPUT + VERSION_WITH_PHDI=$(poetry version) + echo "version=${VERSION_WITH_PHDI:5}" >> $GITHUB_OUTPUT # Create new release tag - name: Set up Git user run: | @@ -65,7 +66,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: tag_name: ${{ steps.get_version.outputs.version }} - release_name: Release ${{ steps.get_version.outputs.version }} + release_name: Release ${{ steps.get_version.outputs.version }} release-to-pypi: name: Build and publish PHDI to PyPI From 6632a5329043117567d3de68d7c5ea4774cfcdbd Mon Sep 17 00:00:00 2001 From: Robert Mitchell Date: Fri, 5 May 2023 15:55:11 -0700 Subject: [PATCH 12/12] Add additional param to the BaseService class to allow for passing other licenses to FastAPI (#526) * added optional parameter to the BaseService class to accept either the DIBBs default Creative Commons Zero v1.0 or the MIT license from a new LicenseType class * added a test to pass a both the DIBBs default and MIT licenses to the BaseService --- phdi/containers/base_service.py | 33 ++++++++++++++++------ tests/containers/test_base_service.py | 40 +++++++++++++++++++-------- 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/phdi/containers/base_service.py b/phdi/containers/base_service.py index 451bf09de7..4577cf70d2 100644 --- a/phdi/containers/base_service.py +++ b/phdi/containers/base_service.py @@ -1,19 +1,38 @@ from fastapi import FastAPI from pathlib import Path +from enum import Enum from importlib import metadata +# create a class with the DIBBs default Creative Commons Zero v1.0 and +# MIT license to be used by the BaseService class +class LicenseType(Enum): + CreativeCommonsZero = { + "name": "Creative Commons Zero v1.0 Universal", + "url": "https://creativecommons.org/publicdomain/zero/1.0/", + } + MIT = {"name": "The MIT License", "url": "https://mit-license.org/"} + + class BaseService: """ Base class for all DIBBs services. This class provides a FastAPI instance with DIBBs metadata and optionally a health check endpoint. """ + LICENSE_INFO = LicenseType.CreativeCommonsZero + DIBBS_CONTACT = { + "name": "CDC Public Health Data Infrastructure", + "url": "https://cdcgov.github.io/phdi-site/", + "email": "dmibuildingblocks@cdc.gov", + } + def __init__( self, service_name: str, description_path: str, include_health_check_endpoint: bool = True, + license_info: LicenseType = LICENSE_INFO, ): """ Initialize a BaseService instance. @@ -23,21 +42,17 @@ def __init__( the service. :param include_health_check_endpoint: If True, the standard DIBBs health check endpoint will be added. + :param license_info: If empty, the standard DIBBs Creative Commons Zero v1.0 + Universal license will be used. The other available option is to use the + MIT license. 
""" description = Path(description_path).read_text(encoding="utf-8") self.include_health_check_endpoint = include_health_check_endpoint self.app = FastAPI( title=service_name, version=metadata.version("phdi"), - contact={ - "name": "CDC Public Health Data Infrastructure", - "url": "https://cdcgov.github.io/phdi-site/", - "email": "dmibuildingblocks@cdc.gov", - }, - license_info={ - "name": "Creative Commons Zero v1.0 Universal", - "url": "https://creativecommons.org/publicdomain/zero/1.0/", - }, + contact=self.DIBBS_CONTACT, + license_info=license_info, description=description, ) diff --git a/tests/containers/test_base_service.py b/tests/containers/test_base_service.py index 294054d2ad..f8387ecb51 100644 --- a/tests/containers/test_base_service.py +++ b/tests/containers/test_base_service.py @@ -1,27 +1,43 @@ -from phdi.containers.base_service import BaseService +from phdi.containers.base_service import LicenseType, BaseService from fastapi.testclient import TestClient from pathlib import Path from importlib import metadata +default_app_version = metadata.version("phdi") +default_app_contact = BaseService.DIBBS_CONTACT +default_app_license = LicenseType.CreativeCommonsZero +alternate_app_license = LicenseType.MIT + def test_base_service(): service = BaseService( - "test_service", Path(__file__).parent.parent / "assets" / "test_description.md" + service_name="test_service", + description_path=Path(__file__).parent.parent + / "assets" + / "test_description.md", ) assert service.app.title == "test_service" - assert service.app.version == metadata.version("phdi") - assert service.app.contact == { - "name": "CDC Public Health Data Infrastructure", - "url": "https://cdcgov.github.io/phdi-site/", - "email": "dmibuildingblocks@cdc.gov", - } - assert service.app.license_info == { - "name": "Creative Commons Zero v1.0 Universal", - "url": "https://creativecommons.org/publicdomain/zero/1.0/", - } + assert service.app.version == default_app_version + assert service.app.contact == default_app_contact + assert service.app.license_info == default_app_license assert service.app.description == "This is a test description." client = TestClient(service.start()) response = client.get("/") assert response.status_code == 200 assert response.json() == {"status": "OK"} + + +def test_base_service_alternate_license(): + service = BaseService( + service_name="test_service", + description_path=Path(__file__).parent.parent + / "assets" + / "test_description.md", + license_info=alternate_app_license, + ) + assert service.app.title == "test_service" + assert service.app.version == default_app_version + assert service.app.contact == default_app_contact + assert service.app.license_info == alternate_app_license + assert service.app.description == "This is a test description."