Commit
Merge pull request #98 from FalkorDB/92-cluster-make-sure-cluster-stays-online-during-all-tests

92 cluster make sure cluster stays online during all tests
dudizimber authored Sep 19, 2024
2 parents d5200b9 + cc53be7 commit 7e1d935
Showing 14 changed files with 354 additions and 72 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build-test-image.yaml
@@ -258,7 +258,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-0' --instance-name 'test-cluster-sz-failover' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-1' --instance-name 'test-cluster-sz-failover' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
- name: PRO/ClusterMultiZone - GCP/us-central1 - Failover & Persistence
if: "true"
testFile: test_cluster.py
@@ -268,7 +268,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-0' --instance-name 'test-cluster-mz-failover' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-1' --instance-name 'test-cluster-mz-failover' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
- name: PRO/ClusterSingleZone - GCP/us-central1 - Failover & Persistence With TLS
if: "true"
testFile: test_cluster.py
@@ -278,7 +278,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-0' --instance-name 'test-cluster-sz-failover-tls' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-1' --instance-name 'test-cluster-sz-failover-tls' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
- name: PRO/ClusterMultiZone - GCP/us-central1 - Failover & Persistence With TLS
if: "true"
testFile: test_cluster.py
@@ -288,7 +288,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-0' --instance-name 'test-cluster-mz-failover-tls' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-1' --instance-name 'test-cluster-mz-failover-tls' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
# - name: PRO/ClusterSingleZone - GCP/us-central1 - Add/Remove Shards
# if: "true"
# testFile: test_cluster_shards.py
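
The failover jobs provision six hosts with one replica per master. Assuming standard Redis Cluster semantics (the workflow only passes these flags through to the test harness), --host-count '6' with --cluster-replicas '1' splits into three masters and three replicas:

    # Hypothetical illustration of how host-count and cluster-replicas
    # relate; none of these variable names come from the workflow itself.
    host_count = 6
    cluster_replicas = 1  # replicas per master

    masters = host_count // (cluster_replicas + 1)
    replicas = host_count - masters
    print(masters, replicas)  # 3 masters, 3 replicas
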
15 changes: 11 additions & 4 deletions falkordb-cluster/cluster-entrypoint.sh
@@ -2,19 +2,19 @@

FALKORDB_USER=${FALKORDB_USER:-falkordb}
#FALKORDB_PASSWORD=${FALKORDB_PASSWORD:-''}
if [[ -f "/run/secrets/falkordbpassword" ]] && [[ -s "/run/secrets/falkordbpassword" ]];then
if [[ -f "/run/secrets/falkordbpassword" ]] && [[ -s "/run/secrets/falkordbpassword" ]]; then
FALKORDB_PASSWORD=$(cat "/run/secrets/falkordbpassword")
elif [[ -n "$FALKORDB_PASSWORD" ]];then
elif [[ -n "$FALKORDB_PASSWORD" ]]; then
FALKORDB_PASSWORD=$FALKORDB_PASSWORD
else
FALKORDB_PASSWORD=''
fi

#ADMIN_PASSWORD=${ADMIN_PASSWORD:-''}
if [[ -f "/run/secrets/adminpassword" ]] && [[ -s "/run/secrets/adminpassword" ]];then
if [[ -f "/run/secrets/adminpassword" ]] && [[ -s "/run/secrets/adminpassword" ]]; then
ADMIN_PASSWORD=$(cat "/run/secrets/adminpassword")
export ADMIN_PASSWORD
elif [[ -n "$ADMIN_PASSWORD" ]];then
elif [[ -n "$ADMIN_PASSWORD" ]]; then
export ADMIN_PASSWORD=$ADMIN_PASSWORD
else
export ADMIN_PASSWORD=''
@@ -166,6 +166,11 @@ set_aof_persistence_config() {
fi
}

config_rewrite() {
echo "Rewriting configuration"
redis-cli -p $NODE_PORT $AUTH_CONNECTION_STRING $TLS_CONNECTION_STRING CONFIG REWRITE
}

create_cluster() {

local urls=""
@@ -256,6 +261,8 @@ set_memory_limit
set_rdb_persistence_config
set_aof_persistence_config

config_rewrite

if [[ $NODE_INDEX -eq 0 && ! -f "/data/cluster_initialized" ]]; then
# Create cluster
echo "Creating cluster"
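
The new config_rewrite helper runs CONFIG REWRITE against the local node, which writes the server's in-memory configuration back to its configuration file so that settings applied at startup (memory limit, RDB and AOF persistence) survive a restart. A minimal sketch of the same call from a client, assuming redis-py; host, port, and password are placeholders:

    # Apply a runtime setting, then persist it the way the entrypoint's
    # config_rewrite helper does. Connection details are illustrative.
    import redis

    r = redis.Redis(host="localhost", port=6379, password="adminpassword")
    r.config_set("maxmemory", "100mb")  # e.g. what set_memory_limit configures
    r.config_rewrite()                  # write the live config back to disk
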
20 changes: 1 addition & 19 deletions falkordb-node/node-entrypoint.sh
@@ -87,6 +87,7 @@ remove_master_from_group() {
if [[ $IS_REPLICA -eq 0 && $RUN_SENTINEL -eq 1 ]]; then
echo "Removing master from sentinel"
redis-cli -p $SENTINEL_PORT -a $ADMIN_PASSWORD --no-auth-warning $TLS_CONNECTION_STRING SENTINEL failover $MASTER_NAME
sleep 5
tries=5
while true; do
master_info=$(redis-cli -a $ADMIN_PASSWORD --no-auth-warning $TLS_CONNECTION_STRING info replication | grep role)
@@ -123,23 +124,6 @@ get_sentinels_list() {
return $sentinels_list
}

send_reset_to_sentinels() {
echo "Sending reset to sentinels"

sentinels_list=$1
i=0
for sentinel in $sentinels_list; do
# Wait 30 seconds for the reset to take effect before sending the next reset
if [[ $i -gt 0 ]]; then
sleep 30
fi
sentinel_ip=$(echo ${sentinel//:/ } | awk '{print $1}')
sentinel_port=$(echo ${sentinel//:/ } | awk '{print $2}')
echo "Sending reset to $sentinel_ip:$sentinel_port"
redis-cli -h $sentinel_ip -p $sentinel_port -a $ADMIN_PASSWORD --no-auth-warning $TLS_CONNECTION_STRING SENTINEL reset $MASTER_NAME
done
}

# Handle signals

handle_sigterm() {
@@ -165,8 +149,6 @@ handle_sigterm() {
wait $sentinel_pid
fi

# send_reset_to_sentinels $sentinels_list

if [[ $RUN_METRICS -eq 1 && ! -z $redis_exporter_pid ]]; then
kill -TERM $redis_exporter_pid
fi
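
After issuing SENTINEL failover, the entrypoint now waits five seconds before polling INFO replication, giving Sentinel time to start promoting a replica before the script checks whether the old master has been demoted. A rough Python equivalent of that wait loop, assuming redis-py; the connection details and retry pacing are illustrative:

    import time
    import redis

    node = redis.Redis(host="localhost", port=6379, password="adminpassword")

    time.sleep(5)  # let the failover begin before the first check
    tries = 5
    while tries > 0:
        # The shell script greps "role" out of INFO replication.
        if node.info("replication")["role"] != "master":
            break  # failover completed; this node is now a replica
        tries -= 1
        time.sleep(5)
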
7 changes: 6 additions & 1 deletion healthcheck_rs/src/main.rs
@@ -37,7 +37,12 @@ fn start_health_check_server() {
fn health_check_handler() -> Result<bool, redis::RedisError> {
let password = match env::var("ADMIN_PASSWORD") {
Ok(password) => password,
Err(_) => "".to_string(),
Err(_) => {
// Read from /run/secrets/adminpassword
let path = "/run/secrets/adminpassword";
let password = std::fs::read_to_string(path).unwrap();
password.trim().to_string()
},
};

let node_port = match env::var("NODE_PORT") {
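
The health check previously fell back to an empty password when ADMIN_PASSWORD was unset; it now reads the Docker-style secret file instead, and the unwrap() means a missing secret file aborts the check rather than silently proceeding unauthenticated. The same lookup order sketched in Python (an illustration, not code from the repo):

    import os
    from pathlib import Path

    def admin_password() -> str:
        # Prefer the environment variable, as the Rust handler does.
        password = os.environ.get("ADMIN_PASSWORD")
        if password is not None:
            return password
        # Otherwise read the secret file; trim the trailing newline.
        return Path("/run/secrets/adminpassword").read_text().strip()
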
14 changes: 14 additions & 0 deletions omnistrate_tests/classes/omnistrate_fleet_instance.py
@@ -2,6 +2,8 @@
import json
import os
import logging
import socket
from redis import retry, backoff, exceptions as redis_exceptions

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(message)s")

@@ -519,6 +521,18 @@ def create_connection(
username="falkordb",
password=self.falkordb_password,
ssl=ssl,
cluster_error_retry_attempts=20,
retry=retry.Retry(
retries=20,
backoff=backoff.ExponentialBackoff(base=3),
supported_errors=(
ConnectionRefusedError,
ConnectionError,
TimeoutError,
socket.timeout,
redis_exceptions.ConnectionError
),
),
)
break
except Exception as e:
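
create_connection now retries transient failures with exponential backoff, so the tests keep polling through the brief windows where a node is restarting. A standalone sketch of the same redis-py retry policy, assuming redis-py 4.x or later; host and credentials are placeholders:

    import socket
    from redis import Redis
    from redis.backoff import ExponentialBackoff
    from redis.exceptions import ConnectionError as RedisConnectionError
    from redis.retry import Retry

    # Waits grow from the 3-second base, capped at redis-py's default cap.
    retry_policy = Retry(
        backoff=ExponentialBackoff(base=3),
        retries=20,
        supported_errors=(
            ConnectionRefusedError,
            ConnectionError,
            TimeoutError,
            socket.timeout,
            RedisConnectionError,
        ),
    )

    client = Redis(host="localhost", port=6379, retry=retry_policy)
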
52 changes: 46 additions & 6 deletions omnistrate_tests/test_cluster.py
@@ -2,6 +2,8 @@
import signal
from random import randbytes
from pathlib import Path
import threading


file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
@@ -96,6 +98,7 @@ def test_cluster():
product_tier_key=product_tier.product_tier_key,
resource_key=args.resource_key,
subscription_id=args.subscription_id,
deployment_create_timeout_seconds=2400
)

try:
@@ -116,13 +119,25 @@ def test_cluster():
hostCount=args.host_count,
clusterReplicas=args.cluster_replicas,
)


thread_signal = threading.Event()
error_signal = threading.Event()
thread = threading.Thread(
target=test_zero_downtime, args=(thread_signal, error_signal, instance, args.tls)
)
thread.start()

if args.ensure_mz_distribution:
test_ensure_mz_distribution(instance, password)

# Test failover and data loss
test_failover(instance)

# Wait for the zero_downtime
thread_signal.set()
thread.join()

# Test stop and start instance
test_stop_start(instance)
except Exception as e:
@@ -133,7 +148,10 @@ def test_cluster():
# Delete instance
instance.delete(False)

logging.info("Test passed")
if error_signal.is_set():
raise ValueError("Test failed")
else:
logging.info("Test passed")


def test_ensure_mz_distribution(instance: OmnistrateFleetInstance, password: str):
@@ -228,9 +246,7 @@ def test_failover(instance: OmnistrateFleetInstance):
replica_id=args.replica_id,
wait_for_ready=True,
)

# Check if data is still there


graph = db.select_graph("test")

result = graph.query("MATCH (n:Person) RETURN n")
@@ -240,8 +256,6 @@

logging.info("Data persisted after failover")

graph.delete()


def test_stop_start(instance: OmnistrateFleetInstance):
"""This function should stop the instance, check that it is stopped, then start it again and check that it is running"""
@@ -274,5 +288,31 @@
logging.info("Instance started")




def test_zero_downtime(
thread_signal: threading.Event,
error_signal: threading.Event,
instance: OmnistrateFleetInstance,
ssl=False,
):
"""This function should test the ability to read and write while a memory update happens"""
try:
db = instance.create_connection(ssl=ssl, force_reconnect=True)

graph = db.select_graph("test")

while not thread_signal.is_set():
# Write some data to the DB
graph.query("CREATE (n:Person {name: 'Alice'})")
graph.ro_query("MATCH (n:Person {name: 'Alice'}) RETURN n")

time.sleep(3)
except Exception as e:
logging.exception(e)
error_signal.set()
raise e

if __name__ == "__main__":
test_cluster()
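
An exception raised inside test_zero_downtime's thread never propagates to the main thread, which is why the worker sets error_signal and test_cluster re-checks it after join() before declaring success. The coordination pattern in isolation, assuming only the standard library:

    import threading
    import time

    def worker(stop: threading.Event, failed: threading.Event):
        try:
            while not stop.is_set():
                time.sleep(1)  # stand-in for the read/write probes
        except Exception:
            failed.set()  # the only way the main thread sees the failure
            raise

    stop, failed = threading.Event(), threading.Event()
    t = threading.Thread(target=worker, args=(stop, failed))
    t.start()
    # ... run the failover and stop/start tests here ...
    stop.set()
    t.join()
    if failed.is_set():
        raise ValueError("zero-downtime check failed")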
