Skip to content

Commit

Permalink
Copy not symlink latest data files
Browse files Browse the repository at this point in the history
  • Loading branch information
lwrubel committed Aug 5, 2024
1 parent a13b6e5 commit f650e2c
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 12 deletions.
1 change: 1 addition & 0 deletions docker-compose.prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ x-airflow-common:
AIRFLOW_VAR_SUL_PUB_HOST: ${AIRFLOW_VAR_SUL_PUB_HOST}
AIRFLOW_VAR_SUL_PUB_KEY: ${AIRFLOW_VAR_SUL_PUB_KEY}
AIRFLOW_VAR_DATA_DIR: /opt/airflow/data
AIRFLOW_VAR_PUBLISH_DIR: /opt/airflow/data/latest
volumes:
- /opt/app/rialto/rialto-airflow/current/rialto_airflow:/opt/airflow/rialto_airflow
- /data:/opt/airflow/data
Expand Down
3 changes: 1 addition & 2 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,10 @@ x-airflow-common:
AIRFLOW_VAR_SUL_PUB_KEY: ${AIRFLOW_VAR_SUL_PUB_KEY}
AIRFLOW_VAR_DEV_LIMIT: ${AIRFLOW_VAR_DEV_LIMIT}
AIRFLOW_VAR_DATA_DIR: /opt/airflow/data
AIRFLOW_VAR_PUBLISH_DIR: /opt/airflow/data/latest
AIRFLOW_VAR_OPENALEX_EMAIL: ${AIRFLOW_VAR_OPENALEX_EMAIL}
volumes:
- ${AIRFLOW_PROJ_DIR:-.}/rialto_airflow:/opt/airflow/rialto_airflow
# TODO: we may want to put logs and data outside of the project directory so
# they can persist across Capistrano deploys?
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
- ${AIRFLOW_PROJ_DIR:-.}/data:/opt/airflow/data
user: "503:0"
Expand Down
17 changes: 7 additions & 10 deletions rialto_airflow/dags/harvest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import pickle
from pathlib import Path
import shutil

from airflow.decorators import dag, task
from airflow.models import Variable
Expand All @@ -12,6 +13,7 @@
from rialto_airflow.utils import create_snapshot_dir, rialto_authors_file

data_dir = Variable.get("data_dir")
publish_dir = Variable.get("publish_dir")
sul_pub_host = Variable.get("sul_pub_host")
sul_pub_key = Variable.get("sul_pub_key")

Expand Down Expand Up @@ -130,18 +132,13 @@ def publish(pubs_to_contribs, merge_publications):
"""
Publish aggregate data to the JupyterHub environment.
"""
contribs_path = Path(data_dir) / "latest" / "contributions.parquet"
pubs_path = Path(data_dir) / "latest" / "publications.parquet"
contribs_path = Path(publish_dir) / "contributions.parquet"
pubs_path = Path(publish_dir) / "publications.parquet"

if contribs_path.exists():
contribs_path.unlink()
if pubs_path.exists():
pubs_path.unlink()
shutil.copyfile(pubs_to_contribs, contribs_path)
shutil.copyfile(merge_publications, pubs_path)

contribs_path.symlink_to(pubs_to_contribs)
pubs_path.symlink_to(merge_publications)

return str(contribs_path), str(pubs_path)
return str(publish_dir)

snapshot_dir = setup()

Expand Down

0 comments on commit f650e2c

Please sign in to comment.