Skip to content

Commit

Permalink
[test] Add rolling update test (milvus-io#22144)
Browse files Browse the repository at this point in the history
Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
Co-authored-by: Sheldon <chuanfeng.liu@zilliz.com>
  • Loading branch information
zhuwenxing and locustbaby committed Feb 14, 2023
1 parent e5a6d90 commit 33de788
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 8 deletions.
6 changes: 3 additions & 3 deletions tests/python_client/chaos/chaos_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,17 @@ def reconnect(connections, alias='default', timeout=360):
return connections.connect(alias)


def assert_statistic(checkers, expectations=None, succ_rate_threshold=0.95, fail_rate_threshold=0.49):
    """Assert that every health checker meets its expected success/failure statistics.

    Args:
        checkers: mapping of checker key -> checker object exposing
            ``succ_rate()``, ``total()`` and an ``average_time`` attribute.
        expectations: optional mapping of checker key -> expected outcome.
            A value equal to ``constants.FAIL`` means that checker is expected
            to fail; any other value (or a missing key) means it is expected
            to succeed.
        succ_rate_threshold: minimum success rate for an expected-success checker.
        fail_rate_threshold: maximum success rate for an expected-failure checker.
    """
    # Use None as the default instead of {} to avoid the shared
    # mutable-default-argument pitfall; behavior is unchanged for callers.
    if expectations is None:
        expectations = {}
    for k in checkers.keys():
        # expect succ if no expectations
        succ_rate = checkers[k].succ_rate()
        total = checkers[k].total()
        average_time = checkers[k].average_time
        if expectations.get(k, '') == constants.FAIL:
            log.info(f"Expect Fail: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
            expect(succ_rate < fail_rate_threshold or total < 2,
                   f"Expect Fail: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
        else:
            log.info(f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
            expect(succ_rate > succ_rate_threshold and total > 2,
                   f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
13 changes: 10 additions & 3 deletions tests/python_client/chaos/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ def trace(fmt=DEFAULT_FMT, prefix='chaos-test', flag=True):
def decorate(func):
@functools.wraps(func)
def inner_wrapper(self, *args, **kwargs):
start_time = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
start_time = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
t0 = time.perf_counter()
res, result = func(self, *args, **kwargs)
elapsed = time.perf_counter() - t0
end_time = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
end_time = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
operation_name = func.__name__
if flag:
collection_name = self.c_wrap.name
operation_name = func.__name__
log_str = f"[{prefix}]" + fmt.format(**locals())
# TODO: add report function in this place, like uploading to influxdb
# it is better a async way to do this, in case of blocking the request processing
Expand All @@ -56,8 +56,12 @@ def inner_wrapper(self, *args, **kwargs):
self.average_time = (
elapsed + self.average_time * self._succ) / (self._succ + 1)
self._succ += 1
if len(self.fail_records) > 0 and self.fail_records[-1][0] == "failure" and \
self._succ + self._fail == self.fail_records[-1][1] + 1:
self.fail_records.append(("success", self._succ + self._fail, start_time))
else:
self._fail += 1
self.fail_records.append(("failure", self._succ + self._fail, start_time))
return res, result
return inner_wrapper
return decorate
Expand Down Expand Up @@ -91,6 +95,7 @@ class Checker:
def __init__(self, collection_name=None, shards_num=2, dim=ct.default_dim):
self._succ = 0
self._fail = 0
self.fail_records = []
self._keep_running = True
self.rsp_times = []
self.average_time = 0
Expand Down Expand Up @@ -126,6 +131,8 @@ def check_result(self):
checkers_result = f"{checker_name}, succ_rate: {succ_rate:.2f}, total: {total:03d}, average_time: {average_time:.4f}, max_time: {max_time:.4f}, min_time: {min_time:.4f}"
log.info(checkers_result)
log.info(f"{checker_name} rsp times: {self.rsp_times}")
if len(self.fail_records) > 0:
log.info(f"{checker_name} failed at {self.fail_records}")
return checkers_result

def terminate(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,6 @@ def test_operations(self, request_duration, is_check):
for k,v in self.health_checkers.items():
v.check_result()
if is_check:
assert_statistic(self.health_checkers)
assert_statistic(self.health_checkers, succ_rate_threshold=0.98)
assert_expectations()
log.info("*********************Chaos Test Completed**********************")
53 changes: 53 additions & 0 deletions tests/python_client/deploy/scripts/breakdown_rolling_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import psutil
import time
from loguru import logger


def is_rolling_update(proc_info):
    """Return True when a process info dict's cmdline references rollingUpdate.sh.

    ``proc_info`` is a psutil ``p.info`` dict containing at least a
    'cmdline' key. psutil may report cmdline as None instead of a list
    (e.g. access denied, zombie process), and individual args could be
    falsy, so both cases are guarded before the substring test.
    """
    cmdline_list = proc_info.get("cmdline", [])
    if not isinstance(cmdline_list, list):
        return False
    # Substring match per argument so wrappers such as
    # "bash /path/rollingUpdate.sh" are also caught.
    return any(arg and "rollingUpdate.sh" in arg for arg in cmdline_list)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='config for rolling update process')
    parser.add_argument('--wait_time', type=int, default=60, help='wait time after rolling update started')
    args = parser.parse_args()
    wait_time = args.wait_time
    logger.info("start to watch rolling update process")
    start_time = time.time()
    target = None
    process_list = []
    # Poll the process table (for at most 360s) until the rolling update
    # script shows up; keep the last snapshot for diagnostics on timeout.
    while target is None and time.time() - start_time < 360:
        process_list = [p.info for p in psutil.process_iter(attrs=['pid', 'name', 'cmdline'])]
        for process in process_list:
            logger.debug(process)
        logger.debug("##" * 30)
        for process in process_list:
            if is_rolling_update(process):
                logger.info(f"rolling update process: {process} started")
                target = process
                break
        if target is None:
            time.sleep(0.5)
    elapsed = time.time() - start_time
    if target is None:
        logger.info(f"rolling update process not found, wait for {elapsed} seconds")
        logger.info("all process info")
        for process in process_list:
            logger.info(process)
    else:
        logger.info(f"rolling update process {target} found, wait for {elapsed} seconds")
        logger.info(f"wait {wait_time}s to kill rolling update process")
        time.sleep(wait_time)
        logger.info("start to kill rolling update process")
        try:
            p = psutil.Process(target["pid"])
            p.terminate()
            logger.info(f"rolling update process: {target} killed")
        except Exception as e:
            logger.error(f"rolling update process: {target} kill failed, {e}")



5 changes: 4 additions & 1 deletion tests/python_client/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,14 @@ pytest-random-order
python-benedict==0.24.3
timeout-decorator==0.5.0

# for bulk load test
# for bulk insert test
minio==7.1.5

# for benchmark
h5py==3.7.0

# for log
loguru==0.6.0

# util
psutil==5.8.0
37 changes: 37 additions & 0 deletions tests/scripts/breakdown_rolling_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import psutil
import time
from loguru import logger


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='config for rolling update process')
    parser.add_argument('--wait_time', type=int, default=60, help='wait time after rolling update started')
    args = parser.parse_args()
    wait_time = args.wait_time
    logger.info("start to watch rolling update process")
    start_time = time.time()
    target = None
    # Poll the process table (for at most 360s) until a process running
    # rollingUpdate.sh appears.
    while target is None and time.time() - start_time < 360:
        process_list = [p.info for p in psutil.process_iter(attrs=['pid', 'name', 'cmdline'])]
        for process in process_list:
            cmdline_list = process.get("cmdline", [])
            # psutil may report None instead of a list (e.g. access denied);
            # match per-argument substring so "bash /path/rollingUpdate.sh"
            # is caught too (exact membership used to miss it).
            if isinstance(cmdline_list, list) and \
                    any(arg and "rollingUpdate.sh" in arg for arg in cmdline_list):
                logger.info(f"rolling update process: {process} started")
                target = process
                break
        if target is None:
            time.sleep(0.5)
    if target is None:
        # BUG FIX: the previous version fell through to the kill logic even on
        # timeout and terminated whatever process happened to be last in the
        # snapshot; now we only kill the process we actually found.
        logger.info(f"rolling update process not found, wait for {time.time() - start_time} seconds")
    else:
        logger.info(f"wait {wait_time}s to kill rolling update process")
        time.sleep(wait_time)
        logger.info("start to kill rolling update process")
        try:
            p = psutil.Process(target["pid"])
            p.terminate()
            logger.info(f"rolling update process: {target} killed")
        except Exception as e:
            logger.error(f"rolling update process: {target} kill failed, {e}")

0 comments on commit 33de788

Please sign in to comment.