feat(ci): WAN nightly churn #6315

Summary
Jobs
- memory-check
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/memcheck.yml at 38d09e1

	# Memory and churn checks run against a locally started network.
	name: Memory Check

	on:
	  # Merge-queue runs give the protected branches a commit-by-commit pass/fail
	  # history, so any deviation is easy to bisect.
	  merge_group:
	    branches:
	      - main
	      - alpha
	      - beta
	      - rc*
	  # Tests must run on every pull request for it to be valid and to pass the
	  # merge queue.
	  pull_request:
	    branches:
	      - "*"

	env:
	  # Directories that the steps below create, move, and scan for logs.
	  SAFE_DATA_PATH: /home/runner/.local/share/safe
	  CLIENT_DATA_PATH: /home/runner/.local/share/safe/client
	  NODE_DATA_PATH: /home/runner/.local/share/safe/node
	  BOOTSTRAP_NODE_DATA_PATH: /home/runner/.local/share/safe/bootstrap_node
	  RESTART_TEST_NODE_DATA_PATH: /home/runner/.local/share/safe/restart_node

	# Single job: build the binaries, run a local network, then check logs and
	# memory usage. In a workflow file `memory-check` nests under `jobs`, and
	# `runs-on` / `steps` nest under `memory-check`.
	jobs:
	memory-check:
	runs-on: ubuntu-latest
	steps:
	# Fetch the repository for the commit under test.
	- name: Checkout code
	uses: actions/checkout@v4

	# Stable Rust toolchain used by the cargo commands in later steps.
	- name: Install Rust
	uses: dtolnay/rust-toolchain@stable

	# continue-on-error: a failure of the cache action does not fail the job.
	- uses: Swatinem/rust-cache@v2
	continue-on-error: true

	# ripgrep (`rg`) is what the later steps use to scan node and client logs.
	- name: install ripgrep
	  shell: bash
	  # Refresh the package index first so the install does not fail on a stale
	  # index.
	  run: |
	    sudo apt-get update
	    sudo apt-get install -y ripgrep

	# Release build of all binary targets; later steps run them from
	# ./target/release.
	- name: Build binaries
	  timeout-minutes: 30
	  run: cargo build --release --bins

	# Compile the two sn_node integration tests now (--no-run) so the later
	# test steps do not spend their timeout on compilation.
	- name: Build tests
	  timeout-minutes: 30
	  run: >-
	    cargo test --release -p sn_node
	    --test data_with_churn --test verify_routing_table
	    --no-run

	# Starts the first node in the background with its own data/log directory,
	# separate from the nodes started later.
	- name: Start a node instance that does not undergo churn
	  run: |
	    mkdir -p "$BOOTSTRAP_NODE_DATA_PATH"
	    ./target/release/safenode --first \
	      --root-dir "$BOOTSTRAP_NODE_DATA_PATH" \
	      --log-output-dest "$BOOTSTRAP_NODE_DATA_PATH" \
	      --local &
	    # Give the node time to start; the next step reads its logs.
	    sleep 10
	  env:
	    SN_LOG: "all"

	# Pull the first /ip4 listening address out of the bootstrap node's logs,
	# strip the surrounding quotes, and export it as SAFE_PEERS for later steps.
	- name: Set SAFE_PEERS
	  run: |
	    safe_peers=$(rg "listening on \".+\"" "$BOOTSTRAP_NODE_DATA_PATH" -u | \
	      rg '/ip4.*$' -m1 -o | rg '"' -r '')
	    # Fail here rather than exporting an empty SAFE_PEERS to later steps.
	    if [ -z "$safe_peers" ]; then
	      echo "No listening address found in $BOOTSTRAP_NODE_DATA_PATH"
	      exit 1
	    fi
	    echo "SAFE_PEERS=$safe_peers" >> "$GITHUB_ENV"

	# Values written to GITHUB_ENV are only visible from the next step onwards.
	- name: Check SAFE_PEERS was set
	  shell: bash
	  run: echo "The SAFE_PEERS variable has been set to $SAFE_PEERS"

	# Starts a second standalone node in the background. A later step kills it
	# and starts it again on the same root dir.
	- name: Start a node instance to be restarted
	  run: |
	    mkdir -p "$RESTART_TEST_NODE_DATA_PATH"
	    ./target/release/safenode \
	      --root-dir "$RESTART_TEST_NODE_DATA_PATH" \
	      --log-output-dest "$RESTART_TEST_NODE_DATA_PATH" \
	      --local &
	    # Give the node time to start before the network is brought up.
	    sleep 10
	  env:
	    SN_LOG: "all"

	# Starts the local testnet through the maidsafe action, using the binaries
	# built above. set-safe-peers is false so the action leaves SAFE_PEERS alone.
	# NOTE(review): the action is referenced by the mutable `main` ref; consider
	# pinning it to a release tag or commit.
	- name: Start a local network
	  uses: maidsafe/sn-local-testnet-action@main
	  env:
	    SN_LOG: "all"
	  with:
	    action: start
	    join: true
	    build: true
	    set-safe-peers: false
	    interval: 2000
	    platform: ubuntu-latest
	    node-path: target/release/safenode
	    faucet-path: target/release/faucet

	# Starting the testnet must not have replaced SAFE_PEERS with another value;
	# this step prints it for inspection.
	- name: Check SAFE_PEERS was not changed
	  run: echo "The SAFE_PEERS variable has been set to ${SAFE_PEERS}"
	  shell: bash

	# Reads the client wallet address, has the faucet send 5000000 to it, and
	# feeds the last line of the faucet output (the transfer) to `wallet receive`.
	- name: Create and fund a wallet to pay for files storage
	  run: |
	    echo "Obtaining address for use with the faucet..."
	    address=$(cargo run \
	      --bin safe --release -- --log-output-dest=data-dir wallet address | tail -n 1)
	    echo "Sending tokens to the faucet at $address"
	    cargo run \
	      --bin faucet --release -- \
	      --log-output-dest=data-dir send 5000000 "$address" > initial_balance_from_faucet.txt
	    cat initial_balance_from_faucet.txt
	    # The faucet output is kept on disk: a later step uploads it as an artifact.
	    tail -n 1 initial_balance_from_faucet.txt > transfer_hex
	    cat transfer_hex
	    cargo run --bin safe --release -- --log-output-dest=data-dir wallet receive --file transfer_hex
	  env:
	    SN_LOG: "all"
	  timeout-minutes: 15

	# Fetches the archive that the client uploads in the following steps.
	- name: Download 95mb file to be uploaded with the safe client
	  shell: bash
	  run: wget https://sn-node.s3.eu-west-2.amazonaws.com/the-test-data.zip

	# The resources file we upload may change, and with it the memory
	# consumption measured by the memory checks later in this job. Be aware!
	- name: Start a client to upload files
	  # -p makes files public
	  run: |
	    ls -l
	    cargo run --bin safe --release -- --log-output-dest=data-dir files upload "./the-test-data.zip" --retry-strategy quick -p
	  env:
	    SN_LOG: "all"
	  timeout-minutes: 25

	# Uploading the same file with a different client must incur neither payment
	# nor a new upload. The first client's directory is moved aside (only its
	# logs are carried over), a fresh wallet is funded, and the wallet balance
	# reported after the second upload must still be the full funded amount.
	# rg exits non-zero when the pattern is not found, which fails the step.
	- name: Start a different client to upload the same file
	  run: |
	    pwd
	    mv "$CLIENT_DATA_PATH" "$SAFE_DATA_PATH/client_first"
	    ls -l "$SAFE_DATA_PATH"
	    ls -l "$SAFE_DATA_PATH/client_first"
	    mkdir "$SAFE_DATA_PATH/client"
	    ls -l "$SAFE_DATA_PATH"
	    mv "$SAFE_DATA_PATH/client_first/logs" "$CLIENT_DATA_PATH/logs"
	    ls -l "$CLIENT_DATA_PATH"
	    cp ./the-test-data.zip ./the-test-data_1.zip
	    address=$(cargo run --bin safe --release -- --log-output-dest=data-dir wallet address | tail -n 1)
	    cargo run --bin faucet --release -- --log-output-dest=data-dir send 5000000 "$address" > initial_balance_from_faucet_1.txt
	    cat initial_balance_from_faucet_1.txt
	    tail -n 1 initial_balance_from_faucet_1.txt > transfer_hex
	    cat transfer_hex
	    cargo run --bin safe --release -- --log-output-dest=data-dir wallet receive --file transfer_hex
	    cargo run --bin safe --release -- --log-output-dest=data-dir files upload "./the-test-data_1.zip" --retry-strategy quick -p > second_upload.txt
	    cat second_upload.txt
	    # -F: match the balance literally, so the dots are not regex wildcards.
	    rg -F "New wallet balance: 5000000.000000000" second_upload.txt -c --stats
	  env:
	    SN_LOG: "all"
	  timeout-minutes: 25

	# Kill the standalone restart node using the pid file in its root dir.
	- name: Stop the restart node
	  run: kill "$(cat "$RESTART_TEST_NODE_DATA_PATH/safenode.pid")"

	# Start it again on the same root dir so it finds its previous data.
	- name: Start the restart node again
	  run: |
	    ./target/release/safenode \
	      --root-dir "$RESTART_TEST_NODE_DATA_PATH" \
	      --log-output-dest "$RESTART_TEST_NODE_DATA_PATH" \
	      --local &
	    # Give the node time to start and write its logs.
	    sleep 10
	  env:
	    SN_LOG: "all"

	# rg exits non-zero when nothing matches, so the step fails if the restarted
	# node logged no "Existing record loaded" line.
	- name: Assert we've reloaded some chunks
	  run: rg "Existing record loaded" "$RESTART_TEST_NODE_DATA_PATH"

	# Runs the data_with_churn integration test, configured through the
	# TEST_* environment variables below.
	- name: Chunks data integrity during nodes churn
	  timeout-minutes: 30
	  env:
	    SN_LOG: "all"
	    TEST_DURATION_MINS: "5"
	    TEST_TOTAL_CHURN_CYCLES: "15"
	  run: cargo test --release -p sn_node --test data_with_churn -- --nocapture

	# Diagnostic listings only; nothing is asserted here.
	- name: Check current files
	  run: ls -la
	# Use GITHUB_WORKSPACE rather than a hard-coded checkout path so the step
	# also works when the repository has a different name (e.g. on forks).
	- name: Check safenode file
	  run: ls "$GITHUB_WORKSPACE/target/release"

	# Fails the job when any node log contains a failed hard-restart message.
	# rg's exit status drives the branch: 0 means the message was found.
	- name: Check there was no restart issues
	  run: |
	    if rg 'Failed to execute hard-restart command' "$NODE_DATA_PATH"; then
	      echo "Restart issues detected"
	      exit 1
	    else
	      echo "No restart issues detected"
	    fi

	# Runs the verify_routing_table integration test.
	# SLEEP_BEFORE_VERIFICATION is passed through to the test; presumably a
	# delay in seconds before it verifies — confirm against the test source.
	- name: Verify the routing tables of the nodes
	  timeout-minutes: 10
	  env:
	    SLEEP_BEFORE_VERIFICATION: "300"
	  run: cargo test --release -p sn_node --test verify_routing_table -- --nocapture

	- name: Verify restart of nodes using rg
	  shell: bash
	  timeout-minutes: 1
	  # `rg -c --stats` ends with an "N matches" summary line; each pipeline keeps
	  # only that line and then only the number N.
	  # The step fails when fewer peers were removed from routing tables than
	  # nodes were restarted.
	  # TODO: make this use an env var, or relate to testnet size
	  run: |
	    restart_count=$(rg "Node is restarting in" "$NODE_DATA_PATH" -c --stats | \
	      rg "(\d+) matches" | rg "\d+" -o)
	    echo "Restart $restart_count nodes"
	    peer_removed=$(rg "PeerRemovedFromRoutingTable" "$NODE_DATA_PATH" -c --stats | \
	      rg "(\d+) matches" | rg "\d+" -o)
	    echo "PeerRemovedFromRoutingTable $peer_removed times"
	    # Quoted so that an empty count is reported as a failed comparison
	    # instead of a shell syntax error.
	    if [ "$peer_removed" -lt "$restart_count" ]; then
	      echo "PeerRemovedFromRoutingTable times of: $peer_removed is less than the restart count of: $restart_count"
	      exit 1
	    fi
	    node_count=$(ls "$NODE_DATA_PATH" | wc -l)
	    echo "Node dir count is $node_count"
	    # TODO: reenable this once the testnet dir creation is tidied up to avoid a large count here
	    # if [ $restart_count -lt $node_count ]; then
	    #   echo "Restart count of: $restart_count is less than the node count of: $node_count"
	    #   exit 1
	    # fi

	- name: Verify data replication using rg
	  shell: bash
	  timeout-minutes: 1
	  # `rg -c --stats` ends with an "N matches" summary line; each pipeline keeps
	  # only that line and then only the number N.
	  # The counts are printed for inspection; no threshold is enforced.
	  # TODO: make this use an env var, or relate to testnet size
	  # NOTE(review): the bootstrap node logs to its own folder
	  # ($BOOTSTRAP_NODE_DATA_PATH), which is not under $NODE_DATA_PATH, so its
	  # entries are not part of these counts — confirm whether the rg input
	  # should cover that folder as well.
	  run: |
	    sending_list_count=$(rg "Sending a replication list" "$NODE_DATA_PATH" -c --stats | \
	      rg "(\d+) matches" | rg "\d+" -o)
	    echo "Sent $sending_list_count replication lists"
	    received_list_count=$(rg "Received replication list from" "$NODE_DATA_PATH" -c --stats | \
	      rg "(\d+) matches" | rg "\d+" -o)
	    echo "Received $received_list_count replication lists"
	    fetching_attempt_count=$(rg "FetchingKeysForReplication" "$NODE_DATA_PATH" -c --stats | \
	      rg "(\d+) matches" | rg "\d+" -o)
	    echo "Carried out $fetching_attempt_count fetching attempts"
	  if: always()

	# Downloads the uploaded files and fails when nothing ended up in the
	# client's safe_files directory.
	- name: Start a client to download files
	  run: |
	    cargo run --bin safe --release -- --log-output-dest=data-dir files download --retry-strategy quick
	    ls -l "$CLIENT_DATA_PATH/safe_files"
	    downloaded_files=$(ls "$CLIENT_DATA_PATH/safe_files" | wc -l)
	    if [ "$downloaded_files" -lt 1 ]; then
	      echo "Only downloaded $downloaded_files files, less than the 1 file uploaded"
	      exit 1
	    fi
	  env:
	    SN_LOG: "all"
	  timeout-minutes: 10

	# Download the same files again to ensure they do not get corrupted: the
	# downloaded archive must have the same byte size as the local copy that
	# was uploaded.
	- name: Start a client to download the same files again
	  run: |
	    cargo run --bin safe --release -- --log-output-dest=data-dir files download --show-holders --retry-strategy quick
	    ls -l "$CLIENT_DATA_PATH/safe_files"
	    downloaded_files=$(ls "$CLIENT_DATA_PATH/safe_files" | wc -l)
	    if [ "$downloaded_files" -lt 1 ]; then
	      echo "Only downloaded $downloaded_files files, less than the 1 file uploaded"
	      exit 1
	    fi
	    file_size1=$(stat -c "%s" ./the-test-data_1.zip)
	    file_size2=$(stat -c "%s" "$CLIENT_DATA_PATH/safe_files/the-test-data_1.zip")
	    # Quoted so that an empty size is a failed comparison, not a syntax error.
	    if [ "$file_size1" != "$file_size2" ]; then
	      echo "The downloaded file has a different size $file_size2 to the original $file_size1."
	      exit 1
	    fi
	  env:
	    SN_LOG: "all"
	  timeout-minutes: 10

	# Prints how many safenode processes are still alive. Informational only:
	# continue-on-error keeps a non-zero exit from failing the job.
	- name: Check nodes running
	  shell: bash
	  timeout-minutes: 1
	  continue-on-error: true
	  run: pgrep safenode | wc -l
	  if: always()

	# Runs even when earlier steps failed (if: always()) so the network is
	# stopped and its logs are uploaded under the given prefix.
	- name: Stop the local network and upload logs
	  uses: maidsafe/sn-local-testnet-action@main
	  if: always()
	  with:
	    action: stop
	    build: true
	    platform: ubuntu-latest
	    log_file_prefix: safe_test_logs_memcheck

	- name: Check node memory usage
	  shell: bash
	  # The resources file and churning chunk_size we upload may change, and with
	  # it memory consumption. The limit is set high enough to allow for some
	  # variation depending on resources and node location in the network, but
	  # hopefully low enough to catch any wild memory issues.
	  # Any changes to this value should be carefully considered and tested!
	  # As we have a bootstrap node acting as an access point for churning nodes
	  # and client, the memory usage here will be significantly higher than in
	  # the benchmark test, where we don't have a bootstrap node.
	  run: |
	    node_peak_mem_limit_mb="300" # mb

	    # `[^,]*` captures the whole value up to the next comma (a bare `[^,]`
	    # would keep only its first character). The `*/logs/*` glob visits the
	    # log files of every per-node directory under NODE_DATA_PATH.
	    peak_mem_usage=$(
	      rg '"memory_used_mb":[^,]*' "$NODE_DATA_PATH"/*/logs/* -o --no-line-number --no-filename |
	        awk -F':' '/"memory_used_mb":/{print $2}' |
	        sort -n |
	        tail -n 1
	    )
	    echo "Node memory usage: $peak_mem_usage MB"

	    # bc prints 1 when the comparison holds; (( )) turns that into success.
	    if (( $(echo "$peak_mem_usage > $node_peak_mem_limit_mb" | bc -l) )); then
	      echo "Node memory usage exceeded threshold: $peak_mem_usage MB"
	      exit 1
	    fi
	  if: always()

	- name: Check client memory usage
	  shell: bash
	  # Limits here are lower than in the benchmark tests as there is less going on.
	  run: |
	    client_peak_mem_limit_mb="1024" # mb
	    client_avg_mem_limit_mb="512" # mb

	    # `[^,]*` captures the whole value up to the next comma (a bare `[^,]`
	    # would keep only its first character). The glob is quoted so the shell
	    # passes it to rg unexpanded.
	    peak_mem_usage=$(
	      rg '"memory_used_mb":[^,]*' "$CLIENT_DATA_PATH/logs" --glob 'safe.*' -o --no-line-number --no-filename |
	        awk -F':' '/"memory_used_mb":/{print $2}' |
	        sort -n |
	        tail -n 1
	    )
	    echo "Peak memory usage: $peak_mem_usage MB"
	    if (( $(echo "$peak_mem_usage > $client_peak_mem_limit_mb" | bc -l) )); then
	      echo "Client peak memory usage exceeded threshold: $client_peak_mem_limit_mb MB"
	      exit 1
	    fi

	    # Sum of all samples, rounded to an integer for the shell arithmetic below.
	    total_mem=$(
	      rg '"memory_used_mb":[^,]*' "$CLIENT_DATA_PATH/logs" --glob 'safe.*' -o --no-line-number --no-filename |
	        awk -F':' '/"memory_used_mb":/ {sum += $2} END {printf "%.0f\n", sum}'
	    )
	    # Number of samples, taken from the "N matches" line of rg --stats.
	    num_of_times=$(
	      rg "\"memory_used_mb\"" "$CLIENT_DATA_PATH/logs" --glob 'safe.*' -c --stats |
	        rg "(\d+) matches" |
	        rg "\d+" -o
	    )
	    echo "num_of_times: $num_of_times"
	    echo "Total memory is: $total_mem"
	    # Guard the division: without samples there is no average to check.
	    if [ -z "$num_of_times" ] || [ "$num_of_times" -eq 0 ]; then
	      echo "No client memory samples found in $CLIENT_DATA_PATH/logs"
	      exit 1
	    fi
	    average_mem=$((total_mem / num_of_times))
	    echo "Average memory is: $average_mem"

	    if (( $(echo "$average_mem > $client_avg_mem_limit_mb" | bc -l) )); then
	      echo "Client average memory usage exceeded threshold: $client_avg_mem_limit_mb MB"
	      exit 1
	    fi

	- name: Check node swarm_driver handling statistics
	  shell: bash
	  # Reports how often, and for how long, swarm_driver handling of SwarmCmd
	  # and SwarmEvent was logged with a millisecond-scale duration.
	  # With the latest improvements swarm_driver should very rarely have a
	  # super long handling (longer than 1s), so those are not covered here.
	  # rg exits non-zero when it finds no entry, which fails the step directly;
	  # that case is deliberately not handled.
	  # If handlings longer than a second are needed, look for:
	  #   rg "SwarmCmd handled in [^m,µ,n]s:" $NODE_DATA_PATH/*/logs/ --glob 'safe.*' -c --stats
	  run: |
	    # Print the number of log entries matching the regex in $1, read from
	    # the "N matches" line of rg --stats. Searches every per-node logs dir.
	    count_matches() {
	      rg "$1" "$NODE_DATA_PATH"/*/logs/ --glob 'safe.*' -c --stats |
	        rg "(\d+) matches" |
	        rg "\d+" -o
	    }
	    # Print the rounded sum of the millisecond values of entries matching
	    # $1. Splitting on space or "ms:" puts the number in the 4th field.
	    sum_handling_ms() {
	      rg "$1" "$NODE_DATA_PATH"/*/logs/ --glob 'safe.*' -o --no-line-number --no-filename |
	        awk -F' |ms:' '{sum += $4} END {printf "%.0f\n", sum}'
	    }

	    cmd_pattern="SwarmCmd handled in [0-9.]+ms:"
	    event_pattern="SwarmEvent handled in [0-9.]+ms:"

	    num_of_times=$(count_matches "$cmd_pattern")
	    echo "Number of long cmd handling times: $num_of_times"
	    total_long_handling_ms=$(sum_handling_ms "$cmd_pattern")
	    echo "Total cmd long handling time is: $total_long_handling_ms ms"
	    average_handling_ms=$((total_long_handling_ms / num_of_times))
	    echo "Average cmd long handling time is: $average_handling_ms ms"
	    total_long_handling=$((total_long_handling_ms))
	    total_num_of_times=$((num_of_times))

	    num_of_times=$(count_matches "$event_pattern")
	    echo "Number of long event handling times: $num_of_times"
	    total_long_handling_ms=$(sum_handling_ms "$event_pattern")
	    echo "Total event long handling time is: $total_long_handling_ms ms"
	    average_handling_ms=$((total_long_handling_ms / num_of_times))
	    echo "Average event long handling time is: $average_handling_ms ms"

	    # Combined figures over both cmd and event handling.
	    total_long_handling=$((total_long_handling_ms + total_long_handling))
	    total_num_of_times=$((num_of_times + total_num_of_times))
	    average_handling_ms=$((total_long_handling / total_num_of_times))
	    echo "Total swarm_driver long handling times is: $total_num_of_times"
	    echo "Total swarm_driver long handling duration is: $total_long_handling ms"
	    echo "Total average swarm_driver long handling duration is: $average_handling_ms ms"

	# Keeps the faucet output from the wallet-funding step as a build artifact.
	# Runs even after failures, and a failed upload does not fail the job.
	# The action is pinned to a release line instead of the mutable `main` ref.
	- name: Upload payment wallet initialization log
	  uses: actions/upload-artifact@v4
	  with:
	    name: payment_wallet_initialization_log
	    path: initial_balance_from_faucet.txt
	  continue-on-error: true
	  if: always()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat(ci): WAN nightly churn #6315

Workflow file

feat(ci): WAN nightly churn #6315

Jobs

Run details

Workflow file for this run