From 206325ab801c14902e8a49d9894ecdd4b317fe62 Mon Sep 17 00:00:00 2001
From: Alex Vorona <alex@vorona.com.ua>
Date: Fri, 26 Jan 2024 17:33:52 +0000
Subject: [PATCH 1/2] [bsc] init-from-gcs: improve

---
 dysnix/bsc/Chart.yaml                         |  2 +-
 .../bsc/templates/scripts/_init_from_gcs.tpl  | 84 ++++++++++++++-----
 dysnix/bsc/values.yaml                        |  1 +
 3 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/dysnix/bsc/Chart.yaml b/dysnix/bsc/Chart.yaml
index 1b80df0f..122b03bc 100644
--- a/dysnix/bsc/Chart.yaml
+++ b/dysnix/bsc/Chart.yaml
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: bsc
 description: Binance Smart Chain chart for Kubernetes
-version: 0.6.38
+version: 0.6.39
 appVersion: 1.2.15
 
 keywords:
diff --git a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl
index 402b20e4..0fdce0cd 100644
--- a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl
+++ b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl
@@ -7,20 +7,25 @@ set -ex # -e exits on error
 # AWS_SECRET_ACCESS_KEY
 
 DATA_DIR="{{ .Values.bsc.base_path }}"
-CHAINDATA_DIR="${DATA_DIR}/geth/chaindata"
+GETH_DIR="${DATA_DIR}/geth"
+CHAINDATA_DIR="${GETH_DIR}/chaindata"
+STATE_TMP_DIR="${GETH_DIR}/state_tmp"
+ANCIENT_TMP_DIR="${GETH_DIR}/ancient_tmp"
 INITIALIZED_FILE="${DATA_DIR}/.initialized"
 #without gs:// or s3://, just a bucket name and path
 INDEX_URL="{{ .Values.bsc.initFromGCS.indexUrl }}"
 GCS_BASE_URL="{{ .Values.bsc.initFromGCS.baseUrlOverride }}"
 S5CMD=/s5cmd
-EXCLUDE_ANCIENT="--exclude *.cidx --exclude *.ridx --exclude *.cdat --exclude *.rdat"
-EXCLUDE_STATE="--exclude *.ldb --exclude *.sst"
 INDEX="index"
 S_UPDATING="/updating"
 S_TIMESTAMP="/timestamp"
 S_STATE_URL="/state_url"
 S_ANCIENT_URL="/ancient_url"
 S_STATS="/stats"
+MAX_USED_SPACE_PERCENT={{ .Values.bsc.initFromGCS.maxUsedSpacePercent }}
+
+# allow container interrupt
+trap "{ exit 1; }" INT TERM
 
 {{- if .Values.bsc.forceInitFromSnapshot }}
 rm -f "${INITIALIZED_FILE}"
@@ -66,14 +71,10 @@ STATS_URL="${GCS_BASE_URL}${S_STATS}"
 STATE_URL="${GCS_BASE_URL}${S_STATE_URL}"
 ANCIENT_URL="${GCS_BASE_URL}${S_ANCIENT_URL}"
 
-
 STATE_SRC="$(${S5CMD} cat s3://${STATE_URL})"
 ANCIENT_SRC="$(${S5CMD} cat s3://${ANCIENT_URL})"
 REMOTE_STATS="$(${S5CMD} cat s3://${STATS_URL})"
 
-# create dst dirs
-mkdir -p "${CHAINDATA_DIR}/ancient"
-
 # save sync source
 echo "${GCS_BASE_URL}" > "${DATA_DIR}/source"
 
@@ -97,45 +98,77 @@ TIMESTAMP_0="$(${S5CMD} cat s3://${TIMESTAMP_URL})"
 
 # we're ready to perform actual data sync
 
-# we're done when both are true
+# we're done when all are true
 # 1) start and stop timestamps did not changed during data sync - no process started or finished updating the cloud
-# 2) 0 objects copied
+# 2) start timestamp is before stop timestamp - no process is in progress updating the cloud
+# 3) 0 objects copied
 SYNC=2
 CLEANUP=1
 while [ "${SYNC}" -gt 0 ] ; do
 
     # Cleanup
     if [ ${CLEANUP} -eq 1 ];then
-      echo "$(date -Iseconds) Cleaning up local dir ..."
-      mkdir -p ${DATA_DIR}/geth
-      mv ${DATA_DIR}/geth ${DATA_DIR}/geth.old && rm -rf ${DATA_DIR}/geth.old &
+      echo "$(date -Iseconds) Cleaning up local dir ${GETH_DIR} ..."
+      mkdir -p "${GETH_DIR}"
+      mv "${GETH_DIR}" "${GETH_DIR}.old" && rm -rf "${GETH_DIR}.old" &
       CLEANUP=0
     fi
 
-    # sync from cloud to local disk, without removing existing [missing in the cloud] files
+    # sync from cloud to local disk, deleting local files that are missing in the cloud
     # run multiple syncs in background
 
-    # we don't wanna sync ancient data here
-    time ${S5CMD} sync ${EXCLUDE_ANCIENT} s3://${STATE_SRC}/* ${CHAINDATA_DIR}/ > cplist_state.txt &
+    time ${S5CMD} sync --delete s3://${STATE_SRC}/* ${STATE_TMP_DIR}/ > cplist_state.txt &
     STATE_CP_PID=$!
-    time nice ${S5CMD} sync --delete --part-size 200 --concurrency 2 ${EXCLUDE_STATE} s3://${ANCIENT_SRC}/* ${CHAINDATA_DIR}/ancient/ > cplist_ancient.txt &
+    time nice ${S5CMD} sync --delete --part-size 200 --concurrency 2 s3://${ANCIENT_SRC}/* ${ANCIENT_TMP_DIR}/ > cplist_ancient.txt &
     ANCIENT_CP_PID=$!
 
     # wait for all syncs to complete
-    # TODO any errors handling here?
-    wait ${STATE_CP_PID} ${ANCIENT_CP_PID}
-
+    # the shell remembers each background job's exit code, so waiting for them one by one is fine
+    # under "set -e" a bare failing "wait" would abort the script right here; "||" prevents that,
+    # so a non-zero code actually reaches the disk space check and the retry logic below
+    STATE_CP_EXIT_CODE=0
+    wait "${STATE_CP_PID}" || STATE_CP_EXIT_CODE=$?
+    ANCIENT_CP_EXIT_CODE=0
+    wait "${ANCIENT_CP_PID}" || ANCIENT_CP_EXIT_CODE=$?
+
+    # out of disk space is handled specially: a retry cannot help, so we stop here when disk usage is too high
+    VOLUME_USAGE_PERCENT=$(df -P "${DATA_DIR}" | tail -n 1 | awk '{print $5}' | tr -d %)  # -P: one line per filesystem, usage is always field 5
+    if [ "${VOLUME_USAGE_PERCENT}" -gt "${MAX_USED_SPACE_PERCENT}" ];then
+      set +x
+      # stop monitoring
+      if [ ${MON_PID} -ne 0 ];then kill ${MON_PID};MON_PID=0; fi
+      echo "We're out of disk space. Stuck here, nothing we can do. Check the source snapshot size"
+      echo "Source snapshot size ${REMOTE_STATS}"
+      echo "Disk usage is ${VOLUME_USAGE_PERCENT}%"
+      df -P -BG "${DATA_DIR}"
+      # we need to sleep <short-delay> inside loop to handle pod termination w/o delays
+      # infinite sleep loop
+      while true; do sleep 10;done
+      # never hit there
+    fi
+    # s5cmd uses 0 for success and 1 for any errors
+    # no errors - we're good to go
+    # any errors - retry the download
+    # all the exit codes have to be 0
+    if [ "${STATE_CP_EXIT_CODE}" -ne "0" ] || [ "${ANCIENT_CP_EXIT_CODE}" -ne "0" ];then
+      echo "s5cmd sync returned non-zero, retrying sync after the short sleep"
+      # wait some time to not spam with billable requests too frequently
+      sleep 60
+      SYNC=2
+      continue
+    fi
     # get start and stop timestamps from the cloud after sync
     UPDATING_1="$(${S5CMD} cat s3://${UPDATING_URL})"
     TIMESTAMP_1="$(${S5CMD} cat s3://${TIMESTAMP_URL})"
 
     # compare timestamps before and after sync
-    if [ "${UPDATING_0}" -eq "${UPDATING_1}" ] && [ "${TIMESTAMP_0}" -eq "${TIMESTAMP_1}" ];then
-      echo "Timestamps are equal"
+    # ensuring start timestamp is earlier than stop timestamp
+    if [ "${UPDATING_0}" -eq "${UPDATING_1}" ] && [ "${TIMESTAMP_0}" -eq "${TIMESTAMP_1}" ] && [ "${TIMESTAMP_1}" -gt "${UPDATING_1}" ] ;then
+      echo "Timestamps did not changed and start timestamp is before stop timestamp"
       echo -e "U_0=${UPDATING_0}\tU_1=${UPDATING_1},\tT_0=${TIMESTAMP_0}\tT_1=${TIMESTAMP_1}"
       let SYNC=SYNC-1
     else
-      echo "Timestamps changed, running sync again ..."
+      echo "Source timestamps changed or start timestamp is after stop timestamp, running sync again ..."
       echo -e "U_0=${UPDATING_0}\tU_1=${UPDATING_1},\tT_0=${TIMESTAMP_0}\tT_1=${TIMESTAMP_1}"
       # end  timestamps -> begin timestamps
       UPDATING_0=${UPDATING_1}
@@ -148,7 +181,7 @@ while [ "${SYNC}" -gt 0 ] ; do
       continue
     fi
 
-    # stop monitoring
+    # stop monitoring, we don't expect massive data copying
     if [ ${MON_PID} -ne 0 ];then
       kill ${MON_PID}
       MON_PID=0
@@ -168,5 +201,10 @@ while [ "${SYNC}" -gt 0 ] ; do
     fi
 done
 
+# prepare geth datadir from tmp dirs
+mv "${STATE_TMP_DIR}" "${CHAINDATA_DIR}"
+rm -rf "${CHAINDATA_DIR}/ancient"
+mv "${ANCIENT_TMP_DIR}" "${CHAINDATA_DIR}/ancient"
+
 # Mark data dir as initialized
 touch ${INITIALIZED_FILE}
diff --git a/dysnix/bsc/values.yaml b/dysnix/bsc/values.yaml
index 514a5cc3..ea3bb316 100644
--- a/dysnix/bsc/values.yaml
+++ b/dysnix/bsc/values.yaml
@@ -148,6 +148,7 @@ bsc:
     indexUrl: "bucket/path/to/file"
     baseUrlOverride: ""                   # "bucket/path/to/dir"
     fullResyncOnSrcUpdate: false
+    maxUsedSpacePercent: 93               # volume usage (%) above which init is treated as out of disk space
   syncToGCS:
     enabled: false
     image: peakcom/s5cmd:v2.2.2

From 6fdf2e97eadacd96a9da6ab1f2efed02a81df8aa Mon Sep 17 00:00:00 2001
From: Alex Vorona <alex@vorona.com.ua>
Date: Mon, 29 Jan 2024 20:05:53 +0000
Subject: [PATCH 2/2] [bsc] init-from-gcs: re-work out-of-space handling

---
 .../bsc/templates/scripts/_init_from_gcs.tpl  | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl
index 0fdce0cd..a0cf4c0b 100644
--- a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl
+++ b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl
@@ -12,6 +12,7 @@ CHAINDATA_DIR="${GETH_DIR}/chaindata"
 STATE_TMP_DIR="${GETH_DIR}/state_tmp"
 ANCIENT_TMP_DIR="${GETH_DIR}/ancient_tmp"
 INITIALIZED_FILE="${DATA_DIR}/.initialized"
+OUT_OF_SPACE_FILE="${DATA_DIR}/.out_of_space"
 #without gs:// or s3://, just a bucket name and path
 INDEX_URL="{{ .Values.bsc.initFromGCS.indexUrl }}"
 GCS_BASE_URL="{{ .Values.bsc.initFromGCS.baseUrlOverride }}"
@@ -28,7 +29,7 @@ MAX_USED_SPACE_PERCENT={{ .Values.bsc.initFromGCS.maxUsedSpacePercent }}
 trap "{ exit 1; }" INT TERM
 
 {{- if .Values.bsc.forceInitFromSnapshot }}
-rm -f "${INITIALIZED_FILE}"
+rm -f "${INITIALIZED_FILE}" "${OUT_OF_SPACE_FILE}"
 {{- end }}
 
 if [ -f "${INITIALIZED_FILE}" ]; then
@@ -36,6 +37,12 @@ if [ -f "${INITIALIZED_FILE}" ]; then
     exit 0
 fi
 
+if [ -f "${OUT_OF_SPACE_FILE}" ]; then
+    echo "Seems, we're out of space. Exiting with an error ..."
+    cat "${OUT_OF_SPACE_FILE}"
+    exit 2
+fi
+
 # we need to create temp files
 cd /tmp
 
@@ -137,14 +144,12 @@ while [ "${SYNC}" -gt 0 ] ; do
       set +x
       # stop monitoring
       if [ ${MON_PID} -ne 0 ];then kill ${MON_PID};MON_PID=0; fi
-      echo "We're out of disk space. Stuck here, nothing we can do. Check the source snapshot size"
-      echo "Source snapshot size ${REMOTE_STATS}"
-      echo "Disk usage is ${VOLUME_USAGE_PERCENT}%"
-      df -P -BG "${DATA_DIR}"
-      # we need to sleep <short-delay> inside loop to handle pod termination w/o delays
-      # infinite sleep loop
-      while true; do sleep 10;done
-      # never hit there
+      # out of inodes error is "handled" by "set -e"
+      echo "We're out of disk space. Marking ${DATA_DIR} as out-of-space and exiting. Check the source snapshot size" | tee -a "${OUT_OF_SPACE_FILE}"
+      echo "Source snapshot size ${REMOTE_STATS}" | tee -a "${OUT_OF_SPACE_FILE}"
+      echo "Disk usage is ${VOLUME_USAGE_PERCENT}%" | tee -a "${OUT_OF_SPACE_FILE}"
+      df -P -BG "${DATA_DIR}" | tee -a "${OUT_OF_SPACE_FILE}"
+      exit 2
     fi
     # s5cmd uses 0 for success and 1 for any errors
     # no errors - we're good to go