From 206325ab801c14902e8a49d9894ecdd4b317fe62 Mon Sep 17 00:00:00 2001 From: Alex Vorona Date: Fri, 26 Jan 2024 17:33:52 +0000 Subject: [PATCH 1/2] [bsc] init-from-gcs: improve --- dysnix/bsc/Chart.yaml | 2 +- .../bsc/templates/scripts/_init_from_gcs.tpl | 84 ++++++++++++++----- dysnix/bsc/values.yaml | 1 + 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/dysnix/bsc/Chart.yaml b/dysnix/bsc/Chart.yaml index 1b80df0f..122b03bc 100644 --- a/dysnix/bsc/Chart.yaml +++ b/dysnix/bsc/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: bsc description: Binance Smart Chain chart for Kubernetes -version: 0.6.38 +version: 0.6.39 appVersion: 1.2.15 keywords: diff --git a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl index 402b20e4..0fdce0cd 100644 --- a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl +++ b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl @@ -7,20 +7,25 @@ set -ex # -e exits on error # AWS_SECRET_ACCESS_KEY DATA_DIR="{{ .Values.bsc.base_path }}" -CHAINDATA_DIR="${DATA_DIR}/geth/chaindata" +GETH_DIR="${DATA_DIR}/geth" +CHAINDATA_DIR="${GETH_DIR}/chaindata" +STATE_TMP_DIR="${GETH_DIR}/state_tmp" +ANCIENT_TMP_DIR="${GETH_DIR}/ancient_tmp" INITIALIZED_FILE="${DATA_DIR}/.initialized" #without gs:// or s3://, just a bucket name and path INDEX_URL="{{ .Values.bsc.initFromGCS.indexUrl }}" GCS_BASE_URL="{{ .Values.bsc.initFromGCS.baseUrlOverride }}" S5CMD=/s5cmd -EXCLUDE_ANCIENT="--exclude *.cidx --exclude *.ridx --exclude *.cdat --exclude *.rdat" -EXCLUDE_STATE="--exclude *.ldb --exclude *.sst" INDEX="index" S_UPDATING="/updating" S_TIMESTAMP="/timestamp" S_STATE_URL="/state_url" S_ANCIENT_URL="/ancient_url" S_STATS="/stats" +MAX_USED_SPACE_PERCENT={{ .Values.bsc.initFromGCS.maxUsedSpacePercent }} + +# allow container interrupt +trap "{ exit 1; }" INT TERM {{- if .Values.bsc.forceInitFromSnapshot }} rm -f "${INITIALIZED_FILE}" @@ -66,14 +71,10 @@ STATS_URL="${GCS_BASE_URL}${S_STATS}" STATE_URL="${GCS_BASE_URL}${S_STATE_URL}" ANCIENT_URL="${GCS_BASE_URL}${S_ANCIENT_URL}" - STATE_SRC="$(${S5CMD} cat s3://${STATE_URL})" ANCIENT_SRC="$(${S5CMD} cat s3://${ANCIENT_URL})" REMOTE_STATS="$(${S5CMD} cat s3://${STATS_URL})" -# create dst dirs -mkdir -p "${CHAINDATA_DIR}/ancient" - # save sync source echo "${GCS_BASE_URL}" > "${DATA_DIR}/source" @@ -97,45 +98,77 @@ TIMESTAMP_0="$(${S5CMD} cat s3://${TIMESTAMP_URL})" # we're ready to perform actual data sync -# we're done when both are true +# we're done when all are true # 1) start and stop timestamps did not changed during data sync - no process started or finished updating the cloud -# 2) 0 objects copied +# 2) start timestamp is before stop timestamp - no process is in progress updating the cloud +# 3) 0 objects copied SYNC=2 CLEANUP=1 while [ "${SYNC}" -gt 0 ] ; do # Cleanup if [ ${CLEANUP} -eq 1 ];then - echo "$(date -Iseconds) Cleaning up local dir ..." - mkdir -p ${DATA_DIR}/geth - mv ${DATA_DIR}/geth ${DATA_DIR}/geth.old && rm -rf ${DATA_DIR}/geth.old & + echo "$(date -Iseconds) Cleaning up local dir ${GETH_DIR} ..." + mkdir -p "${GETH_DIR}" + mv "${GETH_DIR}" "${GETH_DIR}.old" && rm -rf "${GETH_DIR}.old" & CLEANUP=0 fi - # sync from cloud to local disk, without removing existing [missing in the cloud] files + # sync from cloud to local disk, with removing existing [missing in the cloud] files # run multiple syncs in background - # we don't wanna sync ancient data here - time ${S5CMD} sync ${EXCLUDE_ANCIENT} s3://${STATE_SRC}/* ${CHAINDATA_DIR}/ > cplist_state.txt & + time ${S5CMD} sync --delete s3://${STATE_SRC}/* ${STATE_TMP_DIR}/ > cplist_state.txt & STATE_CP_PID=$! - time nice ${S5CMD} sync --delete --part-size 200 --concurrency 2 ${EXCLUDE_STATE} s3://${ANCIENT_SRC}/* ${CHAINDATA_DIR}/ancient/ > cplist_ancient.txt & + time nice ${S5CMD} sync --delete --part-size 200 --concurrency 2 s3://${ANCIENT_SRC}/* ${ANCIENT_TMP_DIR}/ > cplist_ancient.txt & ANCIENT_CP_PID=$! # wait for all syncs to complete - # TODO any errors handling here? - wait ${STATE_CP_PID} ${ANCIENT_CP_PID} - + # shell tracks all sub-processes and stores exit codes internally + # it's not required to stay in wait state for all background processes at the same time + # we'll handle these processes sequentially + wait ${STATE_CP_PID} + STATE_CP_EXIT_CODE=$? + wait ${ANCIENT_CP_PID} + ANCIENT_CP_EXIT_CODE=$? + + # let's handle out of disk space specially, thus we don't re-try, just stuck here if disk usage is high + VOLUME_USAGE_PERCENT=$(df "${DATA_DIR}" | tail -n 1 | awk '{print $5}'|tr -d %) + if [ "${VOLUME_USAGE_PERCENT}" -gt "${MAX_USED_SPACE_PERCENT}" ];then + set +x + # stop monitoring + if [ ${MON_PID} -ne 0 ];then kill ${MON_PID};MON_PID=0; fi + echo "We're out of disk space. Stuck here, nothing we can do. Check the source snapshot size" + echo "Source snapshot size ${REMOTE_STATS}" + echo "Disk usage is ${VOLUME_USAGE_PERCENT}%" + df -P -BG "${DATA_DIR}" + # we need to sleep inside loop to handle pod termination w/o delays + # infinite sleep loop + while true; do sleep 10;done + # never hit there + fi + # s5cmd uses 0 for success and 1 for any errors + # no errors - we're good to go + # any errors - retry the download + # all the exit codes have to be 0 + if [ "${STATE_CP_EXIT_CODE}" -ne "0" ] || [ "${ANCIENT_CP_EXIT_CODE}" -ne "0" ];then + echo "s5cmd sync returned non-zero, retrying sync after the short sleep" + # wait some time to not spam with billable requests too frequently + sleep 60 + SYNC=2 + continue + fi # get start and stop timestamps from the cloud after sync UPDATING_1="$(${S5CMD} cat s3://${UPDATING_URL})" TIMESTAMP_1="$(${S5CMD} cat s3://${TIMESTAMP_URL})" # compare timestamps before and after sync - if [ "${UPDATING_0}" -eq "${UPDATING_1}" ] && [ "${TIMESTAMP_0}" -eq "${TIMESTAMP_1}" ];then - echo "Timestamps are equal" + # ensuring start timestamp is earlier than stop timestamp + if [ "${UPDATING_0}" -eq "${UPDATING_1}" ] && [ "${TIMESTAMP_0}" -eq "${TIMESTAMP_1}" ] && [ "${TIMESTAMP_1}" -gt "${UPDATING_1}" ] ;then + echo "Timestamps did not changed and start timestamp is before stop timestamp" echo -e "U_0=${UPDATING_0}\tU_1=${UPDATING_1},\tT_0=${TIMESTAMP_0}\tT_1=${TIMESTAMP_1}" let SYNC=SYNC-1 else - echo "Timestamps changed, running sync again ..." + echo "Source timestamps changed or start timestamp is after stop timestamp, running sync again ..." echo -e "U_0=${UPDATING_0}\tU_1=${UPDATING_1},\tT_0=${TIMESTAMP_0}\tT_1=${TIMESTAMP_1}" # end timestamps -> begin timestamps UPDATING_0=${UPDATING_1} @@ -148,7 +181,7 @@ while [ "${SYNC}" -gt 0 ] ; do continue fi - # stop monitoring + # stop monitoring, we don't expect massive data copying if [ ${MON_PID} -ne 0 ];then kill ${MON_PID} MON_PID=0 @@ -168,5 +201,10 @@ while [ "${SYNC}" -gt 0 ] ; do fi done +# prepare geth datadir from tmp dirs +mv "${STATE_TMP_DIR}" "${CHAINDATA_DIR}" +rm -rf "${CHAINDATA_DIR}/ancient" +mv "${ANCIENT_TMP_DIR}" "${CHAINDATA_DIR}/ancient" + # Mark data dir as initialized touch ${INITIALIZED_FILE} diff --git a/dysnix/bsc/values.yaml b/dysnix/bsc/values.yaml index 514a5cc3..ea3bb316 100644 --- a/dysnix/bsc/values.yaml +++ b/dysnix/bsc/values.yaml @@ -148,6 +148,7 @@ bsc: indexUrl: "bucket/path/to/file" baseUrlOverride: "" # "bucket/path/to/dir" fullResyncOnSrcUpdate: false + maxUsedSpacePercent: 93 # percents syncToGCS: enabled: false image: peakcom/s5cmd:v2.2.2 From 6fdf2e97eadacd96a9da6ab1f2efed02a81df8aa Mon Sep 17 00:00:00 2001 From: Alex Vorona Date: Mon, 29 Jan 2024 20:05:53 +0000 Subject: [PATCH 2/2] [bsc] init-from-gcs: re-work out-of-space handling --- .../bsc/templates/scripts/_init_from_gcs.tpl | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl index 0fdce0cd..a0cf4c0b 100644 --- a/dysnix/bsc/templates/scripts/_init_from_gcs.tpl +++ b/dysnix/bsc/templates/scripts/_init_from_gcs.tpl @@ -12,6 +12,7 @@ CHAINDATA_DIR="${GETH_DIR}/chaindata" STATE_TMP_DIR="${GETH_DIR}/state_tmp" ANCIENT_TMP_DIR="${GETH_DIR}/ancient_tmp" INITIALIZED_FILE="${DATA_DIR}/.initialized" +OUT_OF_SPACE_FILE="${DATA_DIR}/.out_of_space" #without gs:// or s3://, just a bucket name and path INDEX_URL="{{ .Values.bsc.initFromGCS.indexUrl }}" GCS_BASE_URL="{{ .Values.bsc.initFromGCS.baseUrlOverride }}" @@ -28,7 +29,7 @@ MAX_USED_SPACE_PERCENT={{ .Values.bsc.initFromGCS.maxUsedSpacePercent }} trap "{ exit 1; }" INT TERM {{- if .Values.bsc.forceInitFromSnapshot }} -rm -f "${INITIALIZED_FILE}" +rm -f "${INITIALIZED_FILE}" "${OUT_OF_SPACE_FILE}" {{- end }} if [ -f "${INITIALIZED_FILE}" ]; then @@ -36,6 +37,12 @@ if [ -f "${INITIALIZED_FILE}" ]; then exit 0 fi +if [ -f "${OUT_OF_SPACE_FILE}" ]; then + echo "Seems, we're out of space. Exiting with an error ..." + cat "${OUT_OF_SPACE_FILE}" + exit 2 +fi + # we need to create temp files cd /tmp @@ -137,14 +144,12 @@ while [ "${SYNC}" -gt 0 ] ; do set +x # stop monitoring if [ ${MON_PID} -ne 0 ];then kill ${MON_PID};MON_PID=0; fi - echo "We're out of disk space. Stuck here, nothing we can do. Check the source snapshot size" - echo "Source snapshot size ${REMOTE_STATS}" - echo "Disk usage is ${VOLUME_USAGE_PERCENT}%" - df -P -BG "${DATA_DIR}" - # we need to sleep inside loop to handle pod termination w/o delays - # infinite sleep loop - while true; do sleep 10;done - # never hit there + # out of inodes error is "handled" by "set -e" + echo "We're out of disk space. Marking ${DATA_DIR} as out-of-space and exiting. Check the source snapshot size" | tee -a "${OUT_OF_SPACE_FILE}" + echo "Source snapshot size ${REMOTE_STATS}" | tee -a "${OUT_OF_SPACE_FILE}" + echo "Disk usage is ${VOLUME_USAGE_PERCENT}%" | tee -a "${OUT_OF_SPACE_FILE}" + df -P -BG "${DATA_DIR}" | tee -a "${OUT_OF_SPACE_FILE}" + exit 2 fi # s5cmd uses 0 for success and 1 for any errors # no errors - we're good to go