diff --git a/.github/workflows/pull-request.yaml b/.github/workflows/pull-request.yaml index cb48578..4fe9a3f 100644 --- a/.github/workflows/pull-request.yaml +++ b/.github/workflows/pull-request.yaml @@ -46,6 +46,9 @@ jobs: - name: K8s Tests - Run Jobs run: | + kind load docker-image kafka-producer --name github-cluster + kind load docker-image spark-jobs --name github-cluster + kind load docker-image trino-queries --name github-cluster cd scripts ./04-run-spark-jobs.sh ./05-run-trino-query.sh \ No newline at end of file diff --git a/README.md b/README.md index 1d31bf9..fb099df 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Apache data system written for development in a local cluster and incramental de - `Kafka` event source to ingest realtime application data - `Spark` framework for microbatch and batch processes - `Delta` ACID-compliant storage layer on file storage +- `Hive` metadata store for the Delta schemas - `Trino` analytics query engine for ad-hoc analysis ## System Tests @@ -19,7 +20,7 @@ An end-to-end test of the system can be run in kubernetes. The test: 3. performs windowed aggregations on the data and saves the results 4. triggers a sql analytics query through trino to simulate an analyst -The tests are triggered through github actions. +The tests are triggered through GitHub Actions; running them requires a self-hosted runner. ## Developer Notes diff --git a/scripts/04-run-spark-jobs.sh b/scripts/04-run-spark-jobs.sh index 28db733..2e1305f 100755 --- a/scripts/04-run-spark-jobs.sh +++ b/scripts/04-run-spark-jobs.sh @@ -20,6 +20,8 @@ while : ; do exit 1 fi if [[ $elapsed -ge $TIMEOUT ]]; then + kubectl describe sparkapplication $APP_NAME -n $NAMESPACE + kubectl logs pyspark-ingest-driver -n dev echo "Timed out waiting for the application to start." 
exit 1 fi @@ -51,6 +53,7 @@ while : ; do exit 1 fi if [[ $elapsed -ge $TIMEOUT ]]; then + kubectl describe sparkapplication $APP_NAME -n $NAMESPACE echo "Timed out waiting for the application to finish." exit 1 fi diff --git a/scripts/05-run-trino-query.sh b/scripts/05-run-trino-query.sh index 29b9725..2f4c9ab 100755 --- a/scripts/05-run-trino-query.sh +++ b/scripts/05-run-trino-query.sh @@ -1 +1,2 @@ -kubectl apply -f ../k8s/trino/trino-query.yaml -n dev \ No newline at end of file +kubectl apply -f ../k8s/trino/trino-query.yaml -n dev +kubectl wait --for=condition=complete --timeout=60s job/trino-query-job -n dev \ No newline at end of file diff --git a/system-design.png b/system-design.png index b5fa8e0..7ed1e77 100644 Binary files a/system-design.png and b/system-design.png differ