Update to use make #4

Open: wants to merge 1 commit into main
2 changes: 1 addition & 1 deletion Dockerfile
@@ -4,7 +4,7 @@ FROM python:3.8-slim-bookworm
WORKDIR /
RUN apt-get update
RUN apt-get upgrade -y
RUN apt-get install curl git bats -y
RUN apt-get install curl git bats make sqlite3 -y

RUN pip install awscli
COPY requirements/requirements.txt /requirements.txt
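The image now includes make and sqlite3 so the task can drive the build through the new Makefile. A quick smoke test of the rebuilt image (the tag organisation-task is a placeholder, not from this repo):

docker build -t organisation-task .
docker run --rm organisation-task sh -c 'make --version && sqlite3 --version'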
41 changes: 41 additions & 0 deletions task/Makefile
@@ -0,0 +1,41 @@
all::

include makerules/makerules.mk
include makerules/datapackage.mk

organisation-package:: $(PACKAGE_DIR)organisation.csv $(PACKAGE_DIR)organisation-check.csv

ifneq ($(WRITE_S3_BUCKET),)
save-organisation-package:: organisation-package
@echo Saving package to S3 bucket $(WRITE_S3_BUCKET)
aws s3 sync $(PACKAGE_DIR) s3://$(WRITE_S3_BUCKET)/organisation-package/$(PACKAGE_DIR) --no-progress
else
save-organisation-package:: organisation-package
@echo WRITE_S3_BUCKET not set. Package will not be saved.
endif

$(CACHE_DIR)local-planning-authority.csv:
@mkdir -p $(CACHE_DIR)
curl -qfs "https://files.planning.data.gov.uk/dataset/local-planning-authority.csv" > $(CACHE_DIR)local-planning-authority.csv

ifneq ($(READ_S3_BUCKET),)
$(PACKAGE_DIR)organisation.csv:
@echo Building organisation data package - using collection files from S3 bucket $(READ_S3_BUCKET)
@mkdir -p $(PACKAGE_DIR)
@mkdir -p $(CACHE_DIR)/organisation-collection/dataset
@aws s3 sync s3://$(READ_S3_BUCKET)/organisation-collection/dataset $(CACHE_DIR)/organisation-collection/dataset/ --no-progress
digital-land organisation-create \
--dataset-dir $(CACHE_DIR)/organisation-collection/dataset \
--output-path $(PACKAGE_DIR)/organisation.csv
else
$(PACKAGE_DIR)organisation.csv:
@echo Building organisation data package - using collection files from CDN
@mkdir -p $(PACKAGE_DIR)
digital-land organisation-create \
--cache-dir $(CACHE_DIR)/organisation-collection/dataset \
--download-url 'https://files.planning.data.gov.uk/organisation-collection/dataset' \
--output-path $(PACKAGE_DIR)/organisation.csv
endif

$(PACKAGE_DIR)organisation-check.csv: $(PACKAGE_DIR)organisation.csv $(CACHE_DIR)local-planning-authority.csv
digital-land organisation-check --input-path $(PACKAGE_DIR)organisation.csv --output-path $@
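
Taken together, these rules build the package and conditionally save it. A minimal sketch of the intended invocation from the task directory, with illustrative bucket names:

cd task
make organisation-package
READ_S3_BUCKET=my-read-bucket WRITE_S3_BUCKET=my-write-bucket make save-organisation-package

If WRITE_S3_BUCKET is unset, save-organisation-package still builds the package but skips the upload, per the else branch above.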
11 changes: 11 additions & 0 deletions task/makerules/datapackage.mk
@@ -0,0 +1,11 @@
ifeq ($(VAR_DIR),)
VAR_DIR=var/
endif

ifeq ($(CACHE_DIR),)
CACHE_DIR=$(VAR_DIR)cache/
endif

ifeq ($(PACKAGE_DIR),)
PACKAGE_DIR=package/
endif
168 changes: 168 additions & 0 deletions task/makerules/makerules.mk
@@ -0,0 +1,168 @@
# deduce the repository
ifeq ($(REPOSITORY),)
REPOSITORY=$(shell basename -s .git `git config --get remote.origin.url`)
endif

ifeq ($(ENVIRONMENT),)
ENVIRONMENT=production
endif

ifeq ($(SOURCE_URL),)
SOURCE_URL=https://raw.githubusercontent.com/digital-land/
endif

ifeq ($(MAKERULES_URL),)
MAKERULES_URL=$(SOURCE_URL)makerules/main/
endif

ifeq ($(CONFIG_URL),)
CONFIG_URL=https://raw.githubusercontent.com/digital-land/config/main/
endif

ifeq ($(COLLECTION_NAME),)
COLLECTION_NAME=$(shell echo "$(REPOSITORY)"|sed 's/-collection$$//')
endif

ifeq ($(COLLECTION_DATASET_BUCKET_NAME),)
COLLECTION_DATASET_BUCKET_NAME=digital-land-$(ENVIRONMENT)-collection-dataset
endif

ifeq ($(HOISTED_COLLECTION_DATASET_BUCKET_NAME),)
HOISTED_COLLECTION_DATASET_BUCKET_NAME=digital-land-$(ENVIRONMENT)-collection-dataset-hoisted
endif

define dataset_url
'https://$(COLLECTION_DATASET_BUCKET_NAME).s3.eu-west-2.amazonaws.com/$(2)-collection/dataset/$(1).sqlite3'
endef

ifeq ($(VAR_DIR),)
VAR_DIR=var/
endif

ifeq ($(CACHE_DIR),)
CACHE_DIR=$(VAR_DIR)cache/
endif


.PHONY: \
makerules\
specification\
config\
init\
first-pass\
second-pass\
third-pass\
clobber\
clean\
commit-makerules\
prune

# keep intermediate files
.SECONDARY:

# don't keep targets built with an error
.DELETE_ON_ERROR:

# work in UTF-8
LANGUAGE := en_GB.UTF-8
LANG := C.UTF-8

# for consistent collation on different machines
LC_COLLATE := C.UTF-8

# current git branch
BRANCH := $(shell git rev-parse --abbrev-ref HEAD)

UNAME := $(shell uname)

# detect the location of the spatialite library; on Linux, add it to the path so Python can pick up the files
ifndef SPATIALITE_EXTENSION
ifeq ($(UNAME), Linux)
SPATIALITE_EXTENSION="/usr/lib/x86_64-linux-gnu/mod_spatialite.so"
endif
ifeq ($(UNAME), Darwin)
SPATIALITE_EXTENSION="/usr/local/lib/mod_spatialite.dylib"
endif
endif

all:: first-pass second-pass third-pass

first-pass::
@:

# restart the make process to pick up collected files
second-pass::
@:

third-pass::
@:

# initialise
init::
pip install --upgrade pip
ifneq (,$(wildcard requirements.txt))
pip3 install --upgrade -r requirements.txt
endif
ifneq (,$(wildcard setup.py))
pip install -e .$(PIP_INSTALL_PACKAGE)
endif
sqlite3 --version

submodules::
git submodule update --init --recursive --remote

# remove targets, force relink
clobber::
@:

# remove intermediate files
clean::
@:

# prune back to source code
prune::
rm -rf ./$(VAR_DIR) $(VALIDATION_DIR)

# update makerules from source
makerules::
curl -qfsL '$(MAKERULES_URL)makerules.mk' > makerules/makerules.mk

ifeq (,$(wildcard ./makerules/specification.mk))
# update local copies of specification files
specification::
@mkdir -p specification/
curl -qfsL '$(SOURCE_URL)/specification/main/specification/attribution.csv' > specification/attribution.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/licence.csv' > specification/licence.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/typology.csv' > specification/typology.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/theme.csv' > specification/theme.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/collection.csv' > specification/collection.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/dataset.csv' > specification/dataset.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/dataset-field.csv' > specification/dataset-field.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/field.csv' > specification/field.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/datatype.csv' > specification/datatype.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/prefix.csv' > specification/prefix.csv
# deprecated ..
curl -qfsL '$(SOURCE_URL)/specification/main/specification/pipeline.csv' > specification/pipeline.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/dataset-schema.csv' > specification/dataset-schema.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/schema.csv' > specification/schema.csv
curl -qfsL '$(SOURCE_URL)/specification/main/specification/schema-field.csv' > specification/schema-field.csv


init:: specification
endif

# local copy of the organisation datapackage
$(CACHE_DIR)organisation.csv:
@mkdir -p $(CACHE_DIR)
curl -qfs "https://files.planning.data.gov.uk/organisation-collection/dataset/organisation.csv" > $(CACHE_DIR)organisation.csv

init:: config

config::;

commit-makerules::
git add makerules
git diff --quiet && git diff --staged --quiet || (git commit -m "Updated makerules $(shell date +%F)"; git push origin $(BRANCH))

commit-collection::
@:
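
Since init checks sqlite3 and the rules above detect a platform-specific spatialite extension, one way to confirm the detected library actually loads (Linux path shown; adjust for your platform):

sqlite3 :memory: '.load /usr/lib/x86_64-linux-gnu/mod_spatialite.so' 'SELECT spatialite_version();'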
81 changes: 6 additions & 75 deletions task/run.sh
@@ -1,83 +1,14 @@
set -e

export SOURCE_URL='https://raw.githubusercontent.com/digital-land/'
export DATASET_DIR=dataset
export CACHE_DIR=var/cache

if [ -z "$DATA_PACKAGE_NAME" ]; then
echo DATA_PACKAGE_NAME not set
exit 1
fi

if [ -z "$READ_S3_BUCKET" ]; then
echo READ_S3_BUCKET not set so files will be downloaded from the production files cdn
fi

if [ -z "$WRITE_S3_BUCKET" ]; then
echo WRITE_S3_BUCKET not set so files will not be uploaded to an S3 Bucket
fi

# TODO should be embedded into package creation code
if [ "$DATA_PACKAGE_NAME" != 'organisation' ]; then
echo Unsupported package.
exit 1
fi

# update digital-land-python
pip install -r ./requirements.txt

TODAY=$(date +%Y-%m-%d)
echo "Running package builder for $DATA_PACKAGE_NAME on $TODAY"
# Setup
make makerules
make init

echo Downloading specification
mkdir -p specification/
curl -qfsL $SOURCE_URL/specification/main/specification/attribution.csv > specification/attribution.csv
curl -qfsL $SOURCE_URL/specification/main/specification/licence.csv > specification/licence.csv
curl -qfsL $SOURCE_URL/specification/main/specification/typology.csv > specification/typology.csv
curl -qfsL $SOURCE_URL/specification/main/specification/theme.csv > specification/theme.csv
curl -qfsL $SOURCE_URL/specification/main/specification/collection.csv > specification/collection.csv
curl -qfsL $SOURCE_URL/specification/main/specification/dataset.csv > specification/dataset.csv
curl -qfsL $SOURCE_URL/specification/main/specification/dataset-field.csv > specification/dataset-field.csv
curl -qfsL $SOURCE_URL/specification/main/specification/field.csv > specification/field.csv
curl -qfsL $SOURCE_URL/specification/main/specification/datatype.csv > specification/datatype.csv
curl -qfsL $SOURCE_URL/specification/main/specification/prefix.csv > specification/prefix.csv
# deprecated ..
curl -qfsL $SOURCE_URL/specification/main/specification/pipeline.csv > specification/pipeline.csv
curl -qfsL $SOURCE_URL/specification/main/specification/dataset-schema.csv > specification/dataset-schema.csv
curl -qfsL $SOURCE_URL/specification/main/specification/schema.csv > specification/schema.csv
curl -qfsL $SOURCE_URL/specification/main/specification/schema-field.csv > specification/schema-field.csv
curl -qfsL $SOURCE_URL/specification/main/specification/datapackage.csv > specification/datapackage.csv
curl -qfsL $SOURCE_URL/specification/main/specification/datapackage-dataset.csv > specification/datapackage-dataset.csv

echo Building data package
mkdir -p $CACHE_DIR

export COLLECTION_NAME=$DATA_PACKAGE_NAME-collection
export COLLECTION_DATASET_DIR=$CACHE_DIR/$COLLECTION_NAME/dataset/

if [ -n "$READ_S3_BUCKET" ]; then
echo Building organisation data package - using collection files from S3 bucket $READ_S3_BUCKET
mkdir -p $COLLECTION_DATASET_DIR
aws s3 sync s3://$READ_S3_BUCKET/$COLLECTION_NAME/$DATASET_DIR $COLLECTION_DATASET_DIR --no-progress
digital-land organisation-create \
--dataset-dir $COLLECTION_DATASET_DIR \
--output-path $DATASET_DIR/organisation.csv
else
echo Building organisation data package - using collection files from CDN
digital-land organisation-create \
--cache-dir $COLLECTION_DATASET_DIR \
--download-url 'https://files.planning.data.gov.uk/organisation-collection/dataset' \
--output-path $DATASET_DIR/organisation.csv
fi

echo Checking data package
curl -qfs https://files.planning.data.gov.uk/dataset/local-planning-authority.csv > $CACHE_DIR/local-planning-authority.csv
digital-land organisation-check --output-path $DATASET_DIR/organisation-check.csv

ls -l $DATASET_DIR || true

# TODO where to permanently store data packages; also this uploads all the files in datasets
if [ -n "$WRITE_S3_BUCKET" ]; then
echo Pushing package to S3 bucket $WRITE_S3_BUCKET
aws s3 sync $DATASET_DIR s3://$WRITE_S3_BUCKET/$DATA_PACKAGE_NAME/$DATASET_DIR --no-progress
fi
# Run
make $DATA_PACKAGE_NAME-package
make save-$DATA_PACKAGE_NAME-package
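
The slimmed-down script now delegates the build to make. A sketch of a complete run, with hypothetical bucket names (both buckets are optional, per the checks above):

DATA_PACKAGE_NAME=organisation READ_S3_BUCKET=my-read-bucket WRITE_S3_BUCKET=my-write-bucket sh run.sh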