Commit
Merge branch 'dev' into eia860-2022-final-release
aesharpe committed Nov 13, 2023
2 parents d84686a + fa096f2 commit 5251f84
Showing 36 changed files with 479 additions and 163 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build-deploy-pudl.yml
@@ -117,6 +117,7 @@ jobs:
--container-env DAGSTER_PG_HOST="104.154.182.24" \
--container-env DAGSTER_PG_DB="dagster-storage" \
--container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
--container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
# Start the VM
- name: Start the deploy-pudl-vm
57 changes: 57 additions & 0 deletions .github/workflows/update-lockfile.yml
@@ -0,0 +1,57 @@
---
name: update-lockfile

on:
workflow_dispatch:
# schedule:
# At 5:28am UTC Monday and Thursday
# - cron: 28 5 * * MON,THU

jobs:
conda-lock:
# Don't run scheduled job on forks.
if: (github.event_name == 'schedule' && github.repository == 'catalyst-cooperative/pudl') || (github.event_name != 'schedule')
defaults:
run:
# Ensure the environment is activated
# <https://github.com/mamba-org/provision-with-micromamba#important>
shell: bash -l {0}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install Micromamba
uses: mamba-org/setup-micromamba@v1
with:
environment-file: environments/conda-lock.yml
environment-name: pudl-dev

- name: Install pudl from branch
run: pip install --editable "./[dev,docs,test,datasette]"

- name: Run conda-lock to recreate lockfile from scratch
run: |
rm environments/conda-lock.yml
conda-lock \
--file=environments/dev-environment.yml \
--file=pyproject.toml \
--lockfile=environments/conda-lock.yml
- name: Open a pull request
uses: peter-evans/create-pull-request@v5
with:
# # The default GITHUB_TOKEN doesn't allow other workflows to trigger.
# # Thus if there are tests to be run, they won't be run. For more info,
# # see the note under
# # <https://github.com/peter-evans/create-pull-request#action-inputs>.
# # One possible workaround is to specify a Personal Access Token (PAT).
# # This PAT should have read-write permissions for "Pull Requests"
# # and read-write permissions for "Contents".
# token: ${{ secrets.GH_PAT_FOR_PR }}
commit-message: Update lockfile
title: Update Lockfile
body: >
This pull request relocks the dependencies with conda-lock.
It is triggered by [update-lockfile](https://github.com/catalyst-cooperative/pudl/blob/main/.github/workflows/update-lockfile.yml).
branch: update-lockfile
labels: dependencies, conda-lock
reviewers: zaneselvans
delete-branch: true
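
For reference, the same relock can be reproduced locally; a minimal sketch, assuming a working pudl-dev environment with conda-lock installed and a repository layout matching the paths used in the workflow above:

    # Recreate the lockfile from scratch, mirroring the workflow step
    rm environments/conda-lock.yml
    conda-lock \
        --file=environments/dev-environment.yml \
        --file=pyproject.toml \
        --lockfile=environments/conda-lock.yml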
6 changes: 6 additions & 0 deletions .gitignore
@@ -38,3 +38,9 @@ notebooks/*.tgz
terraform/.terraform/*
.env
.hypothesis/

# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
devtools/datasette/fly/Dockerfile
devtools/datasette/fly/inspect-data.json
devtools/datasette/fly/metadata.yml
devtools/datasette/fly/all_dbs.tar.zst
34 changes: 34 additions & 0 deletions devtools/datasette/fly/fly.toml
@@ -0,0 +1,34 @@
# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#
app = "catalyst-coop-pudl"
primary_region = "bos"

[[mounts]]
destination = "/data"
source = "datasette"

[[services]]
internal_port = 8080
protocol = "tcp"

[services.concurrency]
hard_limit = 25
soft_limit = 20

[[services.ports]]
handlers = ["http"]
port = 80

[[services.ports]]
handlers = ["tls", "http"]
port = 443

[[services.tcp_checks]]
grace_period = "1m"
interval = 10000
timeout = 2000

[deploy]
wait_timeout = "15m"
10 changes: 10 additions & 0 deletions devtools/datasette/fly/run.sh
@@ -0,0 +1,10 @@
#! /usr/bin/env bash
set -eux

shopt -s nullglob

find /data/ -name '*.sqlite' -delete
mv all_dbs.tar.zst /data
zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
tar -xf /data/all_dbs.tar --directory /data
datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
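
To sanity-check the compressed archive locally before a deploy, the same decompression the entrypoint performs can be run by hand; a sketch, assuming zstd is installed and all_dbs.tar.zst is in the current directory:

    zstd -f -d all_dbs.tar.zst -o all_dbs.tar
    tar -tf all_dbs.tar    # list the bundled .sqlite files without extracting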
122 changes: 122 additions & 0 deletions devtools/datasette/publish.py
@@ -0,0 +1,122 @@
"""Publish the datasette to fly.io.
We use custom logic here because the datasette-publish-fly plugin bakes the
uncompressed databases into the image, which makes the image too large.
We compress the databases before baking them into the image. Then we decompress
them at runtime to a Fly volume mounted at /data. This avoids a long download
at startup, and allows us to stay within the Fly.io 8GB image size limit.
The volume handling is done manually outside of this publish.py script - it
should be terraformed at some point.
Some static fly.io deployment-related files live in ./fly:
* fly.toml - service configuration
* run.sh - service entrypoint
Apart from that: the Dockerfile and dataset-specific
metadata.yml/inspect-data.json are generated by this script.
"""

import json
import logging
import secrets
from pathlib import Path
from subprocess import check_call, check_output

from pudl.metadata.classes import DatasetteMetadata
from pudl.workspace.setup import PudlPaths

logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

DOCKERFILE_TEMPLATE = """
FROM python:3.11.0-slim-bullseye
COPY . /app
WORKDIR /app
RUN apt-get update
RUN apt-get install -y zstd
ENV DATASETTE_SECRET '{datasette_secret}'
RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
ENV PORT 8080
EXPOSE 8080
CMD ["./run.sh"]
"""


def make_dockerfile():
"""Write a dockerfile from template, to use in fly deploy.
We write this from template so we can generate a datasette secret. This way
we don't have to manage secrets at all.
"""
datasette_secret = secrets.token_hex(16)
return DOCKERFILE_TEMPLATE.format(datasette_secret=datasette_secret)


def inspect_data(datasets, pudl_out):
"""Pre-inspect databases to generate some metadata for Datasette.
This is done in the image build process in datasette-publish-fly, but since
we don't have access to the databases in the build process we have to
inspect before building the Docker image.
"""
inspect_output = json.loads(
check_output(
[ # noqa: S603
"datasette",
"inspect",
]
+ [str(pudl_out / ds) for ds in datasets]
)
)

for dataset in inspect_output:
name = Path(inspect_output[dataset]["file"]).name
new_filepath = Path("/data") / name
inspect_output[dataset]["file"] = str(new_filepath)
return inspect_output


def metadata(pudl_out) -> str:
"""Return human-readable metadata for Datasette."""
return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()


def main():
"""Generate deployment files and run the deploy."""
fly_dir = Path(__file__).parent.absolute() / "fly"
docker_path = fly_dir / "Dockerfile"
inspect_path = fly_dir / "inspect-data.json"
metadata_path = fly_dir / "metadata.yml"

pudl_out = PudlPaths().pudl_output
datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
logging.info(f"Inspecting DBs for datasette: {datasets}...")
inspect_output = inspect_data(datasets, pudl_out)
with inspect_path.open("w") as f:
f.write(json.dumps(inspect_output))

logging.info("Writing metadata...")
with metadata_path.open("w") as f:
f.write(metadata(pudl_out))

logging.info("Writing Dockerfile...")
with docker_path.open("w") as f:
f.write(make_dockerfile())

logging.info(f"Compressing {datasets} and putting into docker context...")
check_call(
["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets, # noqa: S603
cwd=pudl_out,
)

logging.info("Running fly deploy...")
check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir) # noqa: S603
logging.info("Deploy finished!")


if __name__ == "__main__":
main()
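
A usage sketch for running the deploy by hand, assuming flyctl is installed (see the docker/Dockerfile change below), FLY_ACCESS_TOKEN is set in the environment, and the PUDL output directory contains the built *.sqlite files:

    export FLY_ACCESS_TOKEN=...   # token value elided; passed through CI via the workflow/compose changes below
    python devtools/datasette/publish.py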
26 changes: 0 additions & 26 deletions devtools/datasette/publish.sh

This file was deleted.

6 changes: 6 additions & 0 deletions docker/Dockerfile
@@ -1,5 +1,7 @@
FROM condaforge/mambaforge:23.3.1-1

SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]

# Install curl and js
# awscli requires unzip, less, groff and mandoc
# hadolint ignore=DL3008
@@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
USER catalyst
WORKDIR ${CONTAINER_HOME}

# Install flyctl
RUN curl -L https://fly.io/install.sh | sh
ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"

ENV CONDA_PREFIX=${CONTAINER_HOME}/env
ENV PUDL_REPO=${CONTAINER_HOME}/pudl
ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"
1 change: 1 addition & 0 deletions docker/docker-compose.yml
@@ -12,6 +12,7 @@ services:
environment:
- API_KEY_EIA
- GCP_BILLING_PROJECT
- FLY_ACCESS_TOKEN
env_file:
- .env
build:
14 changes: 9 additions & 5 deletions docker/gcp_pudl_etl.sh
@@ -85,20 +85,24 @@ function notify_slack() {
# 2>&1 redirects stderr to stdout.
run_pudl_etl 2>&1 | tee $LOGFILE

# Notify slack if the etl succeeded.
# if pipeline is successful, distribute + publish datasette
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
copy_outputs_to_distribution_bucket
fi

# Deploy the updated data to datasette
if [ $GITHUB_REF = "dev" ]; then
gcloud config set run/region us-central1
source ~/devtools/datasette/publish.sh
python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
fi
fi

# Notify slack about entire pipeline's success or failure;
# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
# task that was run above
if [[ ${PIPESTATUS[0]} == 0 ]]; then
notify_slack "success"
else
notify_slack "failure"
fi
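
A minimal illustration of why the script checks ${PIPESTATUS[0]} rather than $? after piping the ETL through tee; the pipeline's own exit status reflects tee, not the command that actually failed:

    false | tee /dev/null          # the pipeline's overall status ($?) is 0, because tee succeeded
    echo "${PIPESTATUS[0]}"        # prints 1: the status of false, the first command in the pipe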
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/CEMS_by_utility.ipynb
@@ -47,7 +47,7 @@
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db())\n",
"#display(pudl_engine)\n",
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/better-heatrates.ipynb
@@ -324,7 +324,7 @@
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"# TODO(janrous): provide property for accessing ferc db?\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"\n",
"API_KEY_EIA = os.environ[\"API_KEY_EIA\"]\n",
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/ferc714-output.ipynb
@@ -142,7 +142,7 @@
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"display(ferc1_engine)\n",
"\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/jupyterhub-test.ipynb
@@ -51,7 +51,7 @@
"source": [
"from pudl.workspace.setup import PudlPaths\n",
"\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
]
2 changes: 1 addition & 1 deletion notebooks/work-in-progress/state-demand.ipynb
@@ -113,7 +113,7 @@
"#HARVEST_ACCOUNT_ID = os.environ[\"HARVEST_ACCOUNT_ID\"]\n",
"\n",
"from pudl.workspace.setup import PudlPaths\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
"ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
"pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
"pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
]
1 change: 0 additions & 1 deletion pyproject.toml
@@ -100,7 +100,6 @@ keywords = [
metadata_to_rst = "pudl.convert.metadata_to_rst:main"
epacems_to_parquet = "pudl.convert.epacems_to_parquet:main"
ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
datasette_metadata_to_yml = "pudl.convert.datasette_metadata_to_yml:main"
pudl_datastore = "pudl.workspace.datastore:main"
pudl_etl = "pudl.cli.etl:main"
pudl_setup = "pudl.workspace.setup_cli:main"