
Merge pull request #279 from dysnix/bsc-improve-gcs-init
[bsc] improve init from GCS
voron authored Jan 29, 2024
2 parents 166509e + 6fdf2e9 commit 13533ae
Showing 3 changed files with 69 additions and 25 deletions.
2 changes: 1 addition & 1 deletion dysnix/bsc/Chart.yaml
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: bsc
 description: Binance Smart Chain chart for Kubernetes
-version: 0.6.38
+version: 0.6.39
 appVersion: 1.2.15
 
 keywords:
91 changes: 67 additions & 24 deletions dysnix/bsc/templates/scripts/_init_from_gcs.tpl
@@ -7,30 +7,42 @@ set -ex # -e exits on error
 # AWS_SECRET_ACCESS_KEY
 
 DATA_DIR="{{ .Values.bsc.base_path }}"
-CHAINDATA_DIR="${DATA_DIR}/geth/chaindata"
+GETH_DIR="${DATA_DIR}/geth"
+CHAINDATA_DIR="${GETH_DIR}/chaindata"
+STATE_TMP_DIR="${GETH_DIR}/state_tmp"
+ANCIENT_TMP_DIR="${GETH_DIR}/ancient_tmp"
 INITIALIZED_FILE="${DATA_DIR}/.initialized"
+OUT_OF_SPACE_FILE="${DATA_DIR}/.out_of_space"
 # without gs:// or s3://, just a bucket name and path
 INDEX_URL="{{ .Values.bsc.initFromGCS.indexUrl }}"
 GCS_BASE_URL="{{ .Values.bsc.initFromGCS.baseUrlOverride }}"
 S5CMD=/s5cmd
-EXCLUDE_ANCIENT="--exclude *.cidx --exclude *.ridx --exclude *.cdat --exclude *.rdat"
-EXCLUDE_STATE="--exclude *.ldb --exclude *.sst"
 INDEX="index"
 S_UPDATING="/updating"
 S_TIMESTAMP="/timestamp"
 S_STATE_URL="/state_url"
 S_ANCIENT_URL="/ancient_url"
+S_STATS="/stats"
+MAX_USED_SPACE_PERCENT={{ .Values.bsc.initFromGCS.maxUsedSpacePercent }}
 
 # allow container interrupt
 trap "{ exit 1; }" INT TERM
 
 {{- if .Values.bsc.forceInitFromSnapshot }}
-rm -f "${INITIALIZED_FILE}"
+rm -f "${INITIALIZED_FILE}" "${OUT_OF_SPACE_FILE}"
 {{- end }}
 
 if [ -f "${INITIALIZED_FILE}" ]; then
   echo "Blockchain already initialized. Exiting..."
   exit 0
 fi
 
+if [ -f "${OUT_OF_SPACE_FILE}" ]; then
+  echo "Looks like we're out of space. Exiting with an error ..."
+  cat "${OUT_OF_SPACE_FILE}"
+  exit 2
+fi
+
 # we need to create temp files
 cd /tmp

@@ -66,14 +78,10 @@
+STATS_URL="${GCS_BASE_URL}${S_STATS}"
 STATE_URL="${GCS_BASE_URL}${S_STATE_URL}"
 ANCIENT_URL="${GCS_BASE_URL}${S_ANCIENT_URL}"
 
-
 STATE_SRC="$(${S5CMD} cat s3://${STATE_URL})"
 ANCIENT_SRC="$(${S5CMD} cat s3://${ANCIENT_URL})"
+REMOTE_STATS="$(${S5CMD} cat s3://${STATS_URL})"
 
-# create dst dirs
-mkdir -p "${CHAINDATA_DIR}/ancient"
-
 # save sync source
 echo "${GCS_BASE_URL}" > "${DATA_DIR}/source"
@@ -97,45 +105,75 @@ TIMESTAMP_0="$(${S5CMD} cat s3://${TIMESTAMP_URL})"
 
 # we're ready to perform actual data sync
 
-# we're done when both are true
+# we're done when all are true
 # 1) start and stop timestamps did not change during data sync - no process started or finished updating the cloud
-# 2) 0 objects copied
+# 2) start timestamp is before stop timestamp - no process is in the middle of updating the cloud
+# 3) 0 objects copied
 SYNC=2
 CLEANUP=1
 while [ "${SYNC}" -gt 0 ] ; do
 
   # Cleanup
   if [ ${CLEANUP} -eq 1 ];then
-    echo "$(date -Iseconds) Cleaning up local dir ..."
-    mkdir -p ${DATA_DIR}/geth
-    mv ${DATA_DIR}/geth ${DATA_DIR}/geth.old && rm -rf ${DATA_DIR}/geth.old &
+    echo "$(date -Iseconds) Cleaning up local dir ${GETH_DIR} ..."
+    mkdir -p "${GETH_DIR}"
+    mv "${GETH_DIR}" "${GETH_DIR}.old" && rm -rf "${GETH_DIR}.old" &
     CLEANUP=0
   fi
 
-  # sync from cloud to local disk, without removing existing [missing in the cloud] files
+  # sync from cloud to local disk, removing local files that are missing in the cloud
   # run multiple syncs in background
 
-  # we don't wanna sync ancient data here
-  time ${S5CMD} sync ${EXCLUDE_ANCIENT} s3://${STATE_SRC}/* ${CHAINDATA_DIR}/ > cplist_state.txt &
+  time ${S5CMD} sync --delete s3://${STATE_SRC}/* ${STATE_TMP_DIR}/ > cplist_state.txt &
   STATE_CP_PID=$!
-  time nice ${S5CMD} sync --delete --part-size 200 --concurrency 2 ${EXCLUDE_STATE} s3://${ANCIENT_SRC}/* ${CHAINDATA_DIR}/ancient/ > cplist_ancient.txt &
+  time nice ${S5CMD} sync --delete --part-size 200 --concurrency 2 s3://${ANCIENT_SRC}/* ${ANCIENT_TMP_DIR}/ > cplist_ancient.txt &
   ANCIENT_CP_PID=$!
 
   # wait for all syncs to complete
-  # TODO any error handling here?
-  wait ${STATE_CP_PID} ${ANCIENT_CP_PID}
+  # the shell tracks all sub-processes and stores their exit codes internally,
+  # so we don't have to stay in a single wait for all background processes at once;
+  # we handle them sequentially instead
+  wait ${STATE_CP_PID}
+  STATE_CP_EXIT_CODE=$?
+  wait ${ANCIENT_CP_PID}
+  ANCIENT_CP_EXIT_CODE=$?
 
+  # handle running out of disk space specially: no retries, we just stay failed here while disk usage is high
+  VOLUME_USAGE_PERCENT=$(df "${DATA_DIR}" | tail -n 1 | awk '{print $5}' | tr -d %)
+  if [ "${VOLUME_USAGE_PERCENT}" -gt "${MAX_USED_SPACE_PERCENT}" ];then
+    set +x
+    # stop monitoring
+    if [ ${MON_PID} -ne 0 ];then kill ${MON_PID};MON_PID=0; fi
+    # an out-of-inodes error is "handled" by "set -e"
+    echo "We're out of disk space. Marking ${DATA_DIR} as out-of-space and exiting. Check the source snapshot size" | tee -a "${OUT_OF_SPACE_FILE}"
+    echo "Source snapshot size ${REMOTE_STATS}" | tee -a "${OUT_OF_SPACE_FILE}"
+    echo "Disk usage is ${VOLUME_USAGE_PERCENT}%" | tee -a "${OUT_OF_SPACE_FILE}"
+    df -P -BG "${DATA_DIR}" | tee -a "${OUT_OF_SPACE_FILE}"
+    exit 2
+  fi
+  # s5cmd exits 0 on success and 1 on any error
+  # no errors - we're good to go
+  # any error - retry the download
+  # both exit codes have to be 0
+  if [ "${STATE_CP_EXIT_CODE}" -ne "0" ] || [ "${ANCIENT_CP_EXIT_CODE}" -ne "0" ];then
+    echo "s5cmd sync returned non-zero, retrying sync after a short sleep"
+    # wait a bit so we don't spam billable requests too frequently
+    sleep 60
+    SYNC=2
+    continue
+  fi
   # get start and stop timestamps from the cloud after sync
   UPDATING_1="$(${S5CMD} cat s3://${UPDATING_URL})"
   TIMESTAMP_1="$(${S5CMD} cat s3://${TIMESTAMP_URL})"
 
   # compare timestamps before and after sync
-  if [ "${UPDATING_0}" -eq "${UPDATING_1}" ] && [ "${TIMESTAMP_0}" -eq "${TIMESTAMP_1}" ];then
-    echo "Timestamps are equal"
+  # also ensure the start timestamp is earlier than the stop timestamp
+  if [ "${UPDATING_0}" -eq "${UPDATING_1}" ] && [ "${TIMESTAMP_0}" -eq "${TIMESTAMP_1}" ] && [ "${TIMESTAMP_1}" -gt "${UPDATING_1}" ];then
+    echo "Timestamps did not change and the start timestamp is before the stop timestamp"
     echo -e "U_0=${UPDATING_0}\tU_1=${UPDATING_1},\tT_0=${TIMESTAMP_0}\tT_1=${TIMESTAMP_1}"
     let SYNC=SYNC-1
   else
-    echo "Timestamps changed, running sync again ..."
+    echo "Source timestamps changed or the start timestamp is after the stop timestamp, running sync again ..."
     echo -e "U_0=${UPDATING_0}\tU_1=${UPDATING_1},\tT_0=${TIMESTAMP_0}\tT_1=${TIMESTAMP_1}"
     # end timestamps -> begin timestamps
     UPDATING_0=${UPDATING_1}
@@ -148,7 +186,7 @@ while [ "${SYNC}" -gt 0 ] ; do
     continue
   fi
 
-  # stop monitoring
+  # stop monitoring, we don't expect massive data copying anymore
   if [ ${MON_PID} -ne 0 ];then
     kill ${MON_PID}
     MON_PID=0
@@ -168,5 +206,10 @@ while [ "${SYNC}" -gt 0 ] ; do
   fi
 done
 
+# prepare the geth datadir from the tmp dirs
+mv "${STATE_TMP_DIR}" "${CHAINDATA_DIR}"
+rm -rf "${CHAINDATA_DIR}/ancient"
+mv "${ANCIENT_TMP_DIR}" "${CHAINDATA_DIR}/ancient"
+
 # Mark data dir as initialized
 touch ${INITIALIZED_FILE}
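
The retry loop above relies on a small consistency protocol: the uploader writes a start timestamp to /updating before pushing data and a stop timestamp to /timestamp after it finishes, so a reader knows its copy is consistent only when both values are unchanged across the sync and the stop timestamp is newer than the start. A minimal standalone sketch of that check, not part of the commit; the s5cmd path and bucket path are illustrative rather than taken from the chart:

#!/bin/sh
# Sketch only. Assumes the uploader writes epoch seconds to <base>/updating
# before pushing and to <base>/timestamp after it finishes, mirroring
# UPDATING_URL/TIMESTAMP_URL in the script above.
set -e
S5CMD=/s5cmd
BASE="my-bucket/bsc"   # illustrative bucket/path, no s3:// prefix

start0="$(${S5CMD} cat "s3://${BASE}/updating")"
stop0="$(${S5CMD} cat "s3://${BASE}/timestamp")"

# ... the actual data sync would run here ...

start1="$(${S5CMD} cat "s3://${BASE}/updating")"
stop1="$(${S5CMD} cat "s3://${BASE}/timestamp")"

# consistent only if nothing changed during the sync and the last upload
# finished, i.e. the stop timestamp is newer than the start timestamp
if [ "${start0}" -eq "${start1}" ] && [ "${stop0}" -eq "${stop1}" ] \
   && [ "${stop1}" -gt "${start1}" ]; then
    echo "snapshot is consistent"
else
    echo "uploader was active, sync again" >&2
    exit 1
fi
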
1 change: 1 addition & 0 deletions dysnix/bsc/values.yaml
@@ -148,6 +148,7 @@ bsc:
     indexUrl: "bucket/path/to/file"
     baseUrlOverride: "" # "bucket/path/to/dir"
     fullResyncOnSrcUpdate: false
+    maxUsedSpacePercent: 93 # percent
   syncToGCS:
     enabled: false
     image: peakcom/s5cmd:v2.2.2
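
A usage sketch for the new knob, assuming the chart is installed from a repo alias named dysnix under the release name bsc (both illustrative). Per the template above, setting forceInitFromSnapshot removes the .initialized and .out_of_space markers, so the init re-runs with the new limit:

# illustrative release name and repo alias
helm upgrade bsc dysnix/bsc \
  --set bsc.initFromGCS.maxUsedSpacePercent=90 \
  --set bsc.forceInitFromSnapshot=true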
