Commit
Merge pull request #98 from FalkorDB/92-cluster-make-sure-cluster-stays-online-during-all-tests

92 cluster make sure cluster stays online during all tests
dudizimber authored Sep 19, 2024
2 parents d5200b9 + cc53be7 commit 7e1d935
Showing 14 changed files with 354 additions and 72 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build-test-image.yaml
@@ -258,7 +258,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-0' --instance-name 'test-cluster-sz-failover' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-1' --instance-name 'test-cluster-sz-failover' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
- name: PRO/ClusterMultiZone - GCP/us-central1 - Failover & Persistence
if: "true"
testFile: test_cluster.py
@@ -268,7 +268,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-0' --instance-name 'test-cluster-mz-failover' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-1' --instance-name 'test-cluster-mz-failover' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
- name: PRO/ClusterSingleZone - GCP/us-central1 - Failover & Persistence With TLS
if: "true"
testFile: test_cluster.py
@@ -278,7 +278,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-0' --instance-name 'test-cluster-sz-failover-tls' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
extraParams: "--resource-key 'cluster-Single-Zone' --replica-id 'cluster-sz-1' --instance-name 'test-cluster-sz-failover-tls' --instance-description 'test-cluster-sz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1'"
- name: PRO/ClusterMultiZone - GCP/us-central1 - Failover & Persistence With TLS
if: "true"
testFile: test_cluster.py
@@ -288,7 +288,7 @@ jobs:
subscriptionId: sub-GJPV3NoNC0
serviceId: ${{ vars.OMNISTRATE_INTERNAL_SERVICE_ID }}
environmentId: ${{ vars.OMNISTRATE_INTERNAL_DEV_ENVIRONMENT}}
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-0' --instance-name 'test-cluster-mz-failover-tls' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
extraParams: "--resource-key 'cluster-Multi-Zone' --replica-id 'cluster-mz-1' --instance-name 'test-cluster-mz-failover-tls' --instance-description 'test-cluster-mz-failover' --instance-type 'e2-medium' --storage-size '30' --tls --rdb-config 'medium' --aof-config 'always' --host-count '6' --cluster-replicas '1' --ensure-mz-distribution"
# - name: PRO/ClusterSingleZone - GCP/us-central1 - Add/Remove Shards
# if: "true"
# testFile: test_cluster_shards.py
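
The failover jobs provision six hosts with one replica per master. Assuming standard Redis Cluster semantics (the workflow only passes these flags through to the test harness), --host-count '6' with --cluster-replicas '1' splits into three masters and three replicas:

    # Hypothetical illustration of how host-count and cluster-replicas
    # relate; none of these variable names come from the workflow itself.
    host_count = 6
    cluster_replicas = 1  # replicas per master

    masters = host_count // (cluster_replicas + 1)
    replicas = host_count - masters
    print(masters, replicas)  # 3 masters, 3 replicas
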
15 changes: 11 additions & 4 deletions falkordb-cluster/cluster-entrypoint.sh
@@ -2,19 +2,19 @@

FALKORDB_USER=${FALKORDB_USER:-falkordb}
#FALKORDB_PASSWORD=${FALKORDB_PASSWORD:-''}
if [[ -f "/run/secrets/falkordbpassword" ]] && [[ -s "/run/secrets/falkordbpassword" ]];then
if [[ -f "/run/secrets/falkordbpassword" ]] && [[ -s "/run/secrets/falkordbpassword" ]]; then
FALKORDB_PASSWORD=$(cat "/run/secrets/falkordbpassword")
elif [[ -n "$FALKORDB_PASSWORD" ]];then
elif [[ -n "$FALKORDB_PASSWORD" ]]; then
FALKORDB_PASSWORD=$FALKORDB_PASSWORD
else
FALKORDB_PASSWORD=''
fi

#ADMIN_PASSWORD=${ADMIN_PASSWORD:-''}
if [[ -f "/run/secrets/adminpassword" ]] && [[ -s "/run/secrets/adminpassword" ]];then
if [[ -f "/run/secrets/adminpassword" ]] && [[ -s "/run/secrets/adminpassword" ]]; then
ADMIN_PASSWORD=$(cat "/run/secrets/adminpassword")
export ADMIN_PASSWORD
elif [[ -n "$ADMIN_PASSWORD" ]];then
elif [[ -n "$ADMIN_PASSWORD" ]]; then
export ADMIN_PASSWORD=$ADMIN_PASSWORD
else
export ADMIN_PASSWORD=''
@@ -166,6 +166,11 @@ set_aof_persistence_config() {
fi
}

config_rewrite() {
echo "Rewriting configuration"
redis-cli -p $NODE_PORT $AUTH_CONNECTION_STRING $TLS_CONNECTION_STRING CONFIG REWRITE
}

create_cluster() {

local urls=""
@@ -256,6 +261,8 @@ set_memory_limit
set_rdb_persistence_config
set_aof_persistence_config

config_rewrite

if [[ $NODE_INDEX -eq 0 && ! -f "/data/cluster_initialized" ]]; then
# Create cluster
echo "Creating cluster"
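
The new config_rewrite helper runs CONFIG REWRITE against the local node, which writes the server's in-memory configuration back to its configuration file so that settings applied at startup (memory limit, RDB and AOF persistence) survive a restart. A minimal sketch of the same call from a client, assuming redis-py; host, port, and password are placeholders:

    # Apply a runtime setting, then persist it the way the entrypoint's
    # config_rewrite helper does. Connection details are illustrative.
    import redis

    r = redis.Redis(host="localhost", port=6379, password="adminpassword")
    r.config_set("maxmemory", "100mb")  # e.g. what set_memory_limit configures
    r.config_rewrite()                  # write the live config back to disk
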
20 changes: 1 addition & 19 deletions falkordb-node/node-entrypoint.sh
@@ -87,6 +87,7 @@ remove_master_from_group() {
if [[ $IS_REPLICA -eq 0 && $RUN_SENTINEL -eq 1 ]]; then
echo "Removing master from sentinel"
redis-cli -p $SENTINEL_PORT -a $ADMIN_PASSWORD --no-auth-warning $TLS_CONNECTION_STRING SENTINEL failover $MASTER_NAME
sleep 5
tries=5
while true; do
master_info=$(redis-cli -a $ADMIN_PASSWORD --no-auth-warning $TLS_CONNECTION_STRING info replication | grep role)
@@ -123,23 +124,6 @@ get_sentinels_list() {
return $sentinels_list
}

send_reset_to_sentinels() {
echo "Sending reset to sentinels"

sentinels_list=$1
i=0
for sentinel in $sentinels_list; do
# Wait 30 seconds for the reset to take effect before sending the next reset
if [[ $i -gt 0 ]]; then
sleep 30
fi
sentinel_ip=$(echo ${sentinel//:/ } | awk '{print $1}')
sentinel_port=$(echo ${sentinel//:/ } | awk '{print $2}')
echo "Sending reset to $sentinel_ip:$sentinel_port"
redis-cli -h $sentinel_ip -p $sentinel_port -a $ADMIN_PASSWORD --no-auth-warning $TLS_CONNECTION_STRING SENTINEL reset $MASTER_NAME
done
}

# Handle signals

handle_sigterm() {
@@ -165,8 +149,6 @@ handle_sigterm() {
wait $sentinel_pid
fi

# send_reset_to_sentinels $sentinels_list

if [[ $RUN_METRICS -eq 1 && ! -z $redis_exporter_pid ]]; then
kill -TERM $redis_exporter_pid
fi
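
After issuing SENTINEL failover, the entrypoint now waits five seconds before polling INFO replication, giving Sentinel time to start promoting a replica before the script checks whether the old master has been demoted. A rough Python equivalent of that wait loop, assuming redis-py; the connection details and retry pacing are illustrative:

    import time
    import redis

    node = redis.Redis(host="localhost", port=6379, password="adminpassword")

    time.sleep(5)  # let the failover begin before the first check
    tries = 5
    while tries > 0:
        # The shell script greps "role" out of INFO replication.
        if node.info("replication")["role"] != "master":
            break  # failover completed; this node is now a replica
        tries -= 1
        time.sleep(5)
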
7 changes: 6 additions & 1 deletion healthcheck_rs/src/main.rs
@@ -37,7 +37,12 @@ fn start_health_check_server() {
fn health_check_handler() -> Result<bool, redis::RedisError> {
let password = match env::var("ADMIN_PASSWORD") {
Ok(password) => password,
Err(_) => "".to_string(),
Err(_) => {
// Read from /run/secrets/adminpassword
let path = "/run/secrets/adminpassword";
let password = std::fs::read_to_string(path).unwrap();
password.trim().to_string()
},
};

let node_port = match env::var("NODE_PORT") {
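
The health check previously fell back to an empty password when ADMIN_PASSWORD was unset; it now reads the Docker-style secret file instead, and the unwrap() means a missing secret file aborts the check rather than silently proceeding unauthenticated. The same lookup order sketched in Python (an illustration, not code from the repo):

    import os
    from pathlib import Path

    def admin_password() -> str:
        # Prefer the environment variable, as the Rust handler does.
        password = os.environ.get("ADMIN_PASSWORD")
        if password is not None:
            return password
        # Otherwise read the secret file; trim the trailing newline.
        return Path("/run/secrets/adminpassword").read_text().strip()
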
14 changes: 14 additions & 0 deletions omnistrate_tests/classes/omnistrate_fleet_instance.py
@@ -2,6 +2,8 @@
import json
import os
import logging
import socket
from redis import retry, backoff, exceptions as redis_exceptions

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(message)s")

@@ -519,6 +521,18 @@ def create_connection(
username="falkordb",
password=self.falkordb_password,
ssl=ssl,
cluster_error_retry_attempts=20,
retry=retry.Retry(
retries=20,
backoff=backoff.ExponentialBackoff(base=3),
supported_errors=(
ConnectionRefusedError,
ConnectionError,
TimeoutError,
socket.timeout,
redis_exceptions.ConnectionError
),
),
)
break
except Exception as e:
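
create_connection now retries transient failures with exponential backoff, so the tests keep polling through the brief windows where a node is restarting. A standalone sketch of the same redis-py retry policy, assuming redis-py 4.x or later; host and credentials are placeholders:

    import socket
    from redis import Redis
    from redis.backoff import ExponentialBackoff
    from redis.exceptions import ConnectionError as RedisConnectionError
    from redis.retry import Retry

    # Waits grow from the 3-second base, capped at redis-py's default cap.
    retry_policy = Retry(
        backoff=ExponentialBackoff(base=3),
        retries=20,
        supported_errors=(
            ConnectionRefusedError,
            ConnectionError,
            TimeoutError,
            socket.timeout,
            RedisConnectionError,
        ),
    )

    client = Redis(host="localhost", port=6379, retry=retry_policy)
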
52 changes: 46 additions & 6 deletions omnistrate_tests/test_cluster.py
@@ -2,6 +2,8 @@
import signal
from random import randbytes
from pathlib import Path
import threading


file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
@@ -96,6 +98,7 @@ def test_cluster():
product_tier_key=product_tier.product_tier_key,
resource_key=args.resource_key,
subscription_id=args.subscription_id,
deployment_create_timeout_seconds=2400
)

try:
@@ -116,13 +119,25 @@ def test_cluster():
hostCount=args.host_count,
clusterReplicas=args.cluster_replicas,
)


thread_signal = threading.Event()
error_signal = threading.Event()
thread = threading.Thread(
target=test_zero_downtime, args=(thread_signal, error_signal, instance, args.tls)
)
thread.start()

if args.ensure_mz_distribution:
test_ensure_mz_distribution(instance, password)

# Test failover and data loss
test_failover(instance)

# Wait for the zero_downtime
thread_signal.set()
thread.join()

# Test stop and start instance
test_stop_start(instance)
except Exception as e:
@@ -133,7 +148,10 @@ def test_cluster():
# Delete instance
instance.delete(False)

logging.info("Test passed")
if error_signal.is_set():
raise ValueError("Test failed")
else:
logging.info("Test passed")


def test_ensure_mz_distribution(instance: OmnistrateFleetInstance, password: str):
@@ -228,9 +246,7 @@ def test_failover(instance: OmnistrateFleetInstance):
replica_id=args.replica_id,
wait_for_ready=True,
)

# Check if data is still there


graph = db.select_graph("test")

result = graph.query("MATCH (n:Person) RETURN n")
@@ -240,8 +256,6 @@

logging.info("Data persisted after failover")

graph.delete()


def test_stop_start(instance: OmnistrateFleetInstance):
"""This function should stop the instance, check that it is stopped, then start it again and check that it is running"""
@@ -274,5 +288,31 @@
logging.info("Instance started")




def test_zero_downtime(
thread_signal: threading.Event,
error_signal: threading.Event,
instance: OmnistrateFleetInstance,
ssl=False,
):
"""This function should test the ability to read and write while a memory update happens"""
try:
db = instance.create_connection(ssl=ssl, force_reconnect=True)

graph = db.select_graph("test")

while not thread_signal.is_set():
# Write some data to the DB
graph.query("CREATE (n:Person {name: 'Alice'})")
graph.ro_query("MATCH (n:Person {name: 'Alice'}) RETURN n")

time.sleep(3)
except Exception as e:
logging.exception(e)
error_signal.set()
raise e

if __name__ == "__main__":
test_cluster()
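
An exception raised inside test_zero_downtime's thread never propagates to the main thread, which is why the worker sets error_signal and test_cluster re-checks it after join() before declaring success. The coordination pattern in isolation, assuming only the standard library:

    import threading
    import time

    def worker(stop: threading.Event, failed: threading.Event):
        try:
            while not stop.is_set():
                time.sleep(1)  # stand-in for the read/write probes
        except Exception:
            failed.set()  # the only way the main thread sees the failure
            raise

    stop, failed = threading.Event(), threading.Event()
    t = threading.Thread(target=worker, args=(stop, failed))
    t.start()
    # ... run the failover and stop/start tests here ...
    stop.set()
    t.join()
    if failed.is_set():
        raise ValueError("zero-downtime check failed")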
