Merge branch 'pahender/DAOS-16220' into pahender/DAOS-16220_user_test

Skip-unit-tests: true Skip-fault-injection-test: true Features: deployment performance Required-githooks: true Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
daos-stack · Oct 22, 2024 · 9ef8594 · 9ef8594
2 parents 9d0be47 + 2507648
commit 9ef8594
Show file tree

Hide file tree

Showing 23 changed files with 450 additions and 146 deletions.
diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh
@@ -37,5 +37,9 @@ pip install --requirement requirements-utest.txt
 
 pip install /opt/daos/lib/daos/python/
 
+# set high open file limit in the shell to avoid extra warning
+sudo prlimit --nofile=1024:262144 --pid $$
+prlimit -n
+
 ./utils/node_local_test.py --max-log-size 1700MiB --dfuse-dir /localhome/jenkins/ \
     --log-usage-save nltir.xml --log-usage-export nltr.json all
diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md
@@ -44,7 +44,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_MD\_CAP         |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.|
 |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.|
 |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.|
-|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
+|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
 |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.|
 |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"|
 |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.|

diff --git a/src/cart/README.env b/src/cart/README.env
@@ -1,13 +1,10 @@
 This file lists the environment variables used in CaRT.
 
  . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR)
-   It determines which mercury NA plugin to be used:
+   It determines which mercury NA plugin and transport to be used:
    - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider
-   - set it as "ofi+gni" to use OFI gni provider
    - set it as "sm" to use SM plugin which only works within single node
-   - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider.
-   - set it as "ofi+sockets" to use OFI sockets provider
-      NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm"
+   - set it as "ofi+tcp" to use OFI tcp provider.
    - by default (not set or set as any other value) it will use ofi tcp
       provider.
 
@@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT.
    start copying data in an effort to release multi-recv buffers. Copy will occur when at
    most D_MRECV_BUF_COPY buffers remain.
 
+ . SWIM_TRAFFIC_CLASS
+   (server only) Select a traffic class for the SWIM protocol to use and prevent potential
+   traffic congestion. Available options are: "unspec" (default), "best_effort",
+   "low_latency", "bulk_data".
+
diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c
@@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
 	init_info.request_post_incr         = crt_gdata.cg_post_incr;
 	init_info.multi_recv_op_max         = crt_gdata.cg_mrecv_buf;
 	init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy;
+	/* Separate SWIM traffic in an effort to prevent potential congestion. */
+	if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_ctx_idx)
+		init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc;
 
 	hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info);
 	if (hg_class == NULL) {

diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c
@@ -18,6 +18,10 @@ static volatile int     gdata_init_flag;
 struct crt_plugin_gdata crt_plugin_gdata;
 static bool             g_prov_settings_applied[CRT_PROV_COUNT];
 
+/*
+ * Traffic class names generated from the CRT_TRAFFIC_CLASSES X-macro list. X() keeps only
+ * the string (second) field of each entry, so the names appear in list order.
+ */
+#define X(a, b) b,
+static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES};
+#undef X
+
 static void
 crt_lib_init(void) __attribute__((__constructor__));
 
@@ -228,6 +232,17 @@ crt_gdata_dump(void)
 	DUMP_GDATA_FIELD("%d", cg_rpc_quota);
 }
 
+/**
+ * Convert a traffic class name to its enum crt_traffic_class value.
+ *
+ * \param[in] str	Traffic class name to look up; may be NULL.
+ *
+ * \return		Matching traffic class, or CRT_TC_UNSPEC when \a str is NULL or does
+ *			not name a selectable class (this includes "unknown").
+ */
+static enum crt_traffic_class
+crt_str_to_tc(const char *str)
+{
+	int i;
+
+	if (str == NULL)
+		return CRT_TC_UNSPEC;
+
+	/* Test the index before using it so only selectable names are ever compared. */
+	for (i = 0; i < CRT_TC_UNKNOWN; i++) {
+		if (strcmp(crt_tc_name[i], str) == 0)
+			return (enum crt_traffic_class)i;
+	}
+
+	return CRT_TC_UNSPEC;
+}
+
 /* first step init - for initializing crt_gdata */
 static int
 data_init(int server, crt_init_options_t *opt)
@@ -238,9 +253,10 @@ data_init(int server, crt_init_options_t *opt)
 	uint32_t     mem_pin_enable = 0;
 	uint32_t     is_secondary;
 	uint32_t     post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR;
-	unsigned int mrecv_buf      = CRT_HG_MRECV_BUF;
-	unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */
-	int          rc             = 0;
+	unsigned int mrecv_buf          = CRT_HG_MRECV_BUF;
+	unsigned int mrecv_buf_copy     = 0; /* buf copy disabled by default */
+	char        *swim_traffic_class = NULL;
+	int          rc                 = 0;
 
 	crt_env_dump();
 
@@ -253,6 +269,8 @@ data_init(int server, crt_init_options_t *opt)
 	crt_gdata.cg_mrecv_buf = mrecv_buf;
 	crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy);
 	crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy;
+	crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class);
+	crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class);
 
 	is_secondary = 0;
 	/* Apply CART-890 workaround for server side only */
@@ -492,6 +510,61 @@ check_grpid(crt_group_id_t grpid)
 	return rc;
 }
 
+#define CRT_MIN_TCP_FD 131072
+
+/**
+ * Raise the soft RLIMIT_NOFILE limit of the calling process when it is below CRT_MIN_TCP_FD.
+ *
+ * Some providers require a file descriptor for every connection, and some platforms set the
+ * soft limit so low that we run out. The soft limit is raised to the hard limit to avoid this.
+ * When the hard limit itself is below CRT_MIN_TCP_FD, a non-root caller gets a warning (the
+ * soft limit is still raised as far as the hard limit allows), while root also attempts to
+ * raise the hard limit to CRT_MIN_TCP_FD. Failures are logged; nothing is returned.
+ */
+static void
+file_limit_bump(void)
+{
+	int           rc;
+	struct rlimit rlim;
+
+	/* Bump file descriptor limit if low and if possible */
+	rc = getrlimit(RLIMIT_NOFILE, &rlim);
+	if (rc != 0) {
+		DS_ERROR(errno, "getrlimit() failed. Unable to check file descriptor limit");
+		/** Per the man page, this can only fail if rlim is invalid */
+		D_ASSERT(0);
+		return;
+	}
+
+	/* Soft limit is already high enough, nothing to do */
+	if (rlim.rlim_cur >= CRT_MIN_TCP_FD)
+		return;
+
+	if (rlim.rlim_max < CRT_MIN_TCP_FD) {
+		/* rlim_t is not guaranteed to be unsigned long, so cast for the %lu conversions */
+		if (getuid() != 0) {
+			D_WARN("File descriptor hard limit should be at least %d, limit is %lu\n",
+			       CRT_MIN_TCP_FD, (unsigned long)rlim.rlim_max);
+		} else {
+			/** root should be able to change it */
+			D_INFO("Super user attempting to update hard file descriptor limit to %d,"
+			       " limit was %lu\n",
+			       CRT_MIN_TCP_FD, (unsigned long)rlim.rlim_max);
+			rlim.rlim_max = CRT_MIN_TCP_FD;
+		}
+
+		/* Soft limit already equals the (unchanged) hard limit, nothing left to raise */
+		if (rlim.rlim_cur >= rlim.rlim_max)
+			return;
+
+		/* May as well bump it as much as we can */
+	}
+
+	rlim.rlim_cur = rlim.rlim_max;
+	rc            = setrlimit(RLIMIT_NOFILE, &rlim);
+	if (rc != 0) {
+		DS_ERROR(errno,
+			 "setrlimit() failed. Unable to bump file descriptor"
+			 " limit to value >= %d, limit is %lu",
+			 CRT_MIN_TCP_FD, (unsigned long)rlim.rlim_max);
+		return;
+	}
+	D_INFO("Updated soft file descriptor limit to %lu\n", (unsigned long)rlim.rlim_cur);
+}
+
 static void
 prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt)
 {
@@ -510,6 +583,9 @@ prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt)
 			d_setenv("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd", 0);
 	}
 
+	if (prov == CRT_PROV_OFI_TCP || prov == CRT_PROV_OFI_TCP_RXM)
+		file_limit_bump();
+
 	if (prov == CRT_PROV_OFI_CXI)
 		mrc_enable = 1;
 

diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h
@@ -42,6 +42,17 @@ struct crt_na_config {
 	char            **noc_domain_str; /* Array of domains */
 };
 
+/*
+ * X-macro list of traffic classes, one X(enum constant, name string) entry per class. It is
+ * expanded below to declare enum crt_traffic_class. Keep CRT_TC_UNKNOWN as the last entry:
+ * crt_str_to_tc() in crt_init.c uses it as the end-of-list marker.
+ */
+#define CRT_TRAFFIC_CLASSES                                                                        \
+	X(CRT_TC_UNSPEC, "unspec")           /* Leave it upon plugin to choose */                  \
+	X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */                                     \
+	X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */                                     \
+	X(CRT_TC_BULK_DATA, "bulk_data")     /* Bulk data */                                       \
+	X(CRT_TC_UNKNOWN, "unknown")         /* Unknown */
+
+#define X(a, b) a,
+enum crt_traffic_class { CRT_TRAFFIC_CLASSES };
+#undef X
+
 struct crt_prov_gdata {
 	/** NA plugin type */
 	int			cpg_provider;
@@ -105,6 +116,9 @@ struct crt_gdata {
 	/** cart context index used by SWIM */
 	int32_t                  cg_swim_ctx_idx;
 
+	/** traffic class used by SWIM */
+	enum crt_traffic_class   cg_swim_tc;
+
 	/** credits limitation for #in-flight RPCs per target EP CTX */
 	uint32_t		cg_credit_ep_ctx;
 
@@ -220,6 +234,7 @@ struct crt_event_cb_priv {
 	ENV(SWIM_PING_TIMEOUT)                                                                     \
 	ENV(SWIM_PROTOCOL_PERIOD_LEN)                                                              \
 	ENV(SWIM_SUSPECT_TIMEOUT)                                                                  \
+	ENV_STR(SWIM_TRAFFIC_CLASS)                                                                \
 	ENV_STR(UCX_IB_FORK_INIT)
 
 /* uint env */

diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info)
 	return rc;
 
 send_error:
-	rc = crt_bulk_free(cb_info->buc_bulk_hdl);
+	/* send back whatever error got us here */
 	output->rc = rc;
+	rc         = crt_bulk_free(cb_info->buc_bulk_hdl);
+	if (rc != 0)
+		DL_ERROR(rc, "crt_bulk_free() failed");
+
 	iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value,
 			   cb_info->buc_user_priv);
 

diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go
@@ -588,7 +588,7 @@ disable_vfio: false
 disable_vmd: false
 enable_hotplug: false
 nr_hugepages: 0
-system_ram_reserved: 16
+system_ram_reserved: 26
 disable_hugepages: false
 control_log_mask: INFO
 control_log_file: /tmp/daos_server.log

diff --git a/src/control/server/storage/scm.go b/src/control/server/storage/scm.go
@@ -51,7 +51,7 @@ const (
 
 // Memory reservation constant defaults to be used when calculating RAM-disk size for DAOS I/O engine.
 const (
-	DefaultSysMemRsvd    = humanize.GiByte * 16  // per-system
+	DefaultSysMemRsvd    = humanize.GiByte * 26  // per-system
 	DefaultTgtMemRsvd    = humanize.MiByte * 128 // per-engine-target
 	DefaultEngineMemRsvd = humanize.GiByte * 1   // per-engine
 )

diff --git a/src/control/server/storage/scm_test.go b/src/control/server/storage/scm_test.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2023 Intel Corporation.
+// (C) Copyright 2023-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -39,28 +39,28 @@ func Test_CalcRamdiskSize(t *testing.T) {
 			expErr:   errors.New("requires positive nonzero nr engines"),
 		},
 		"default values; low mem": {
-			memTotal: humanize.GiByte * 30,
+			memTotal: humanize.GiByte * 40,
 			memHuge:  humanize.GiByte * 14,
 			memSys:   DefaultSysMemRsvd,
 			tgtCount: 8,
 			engCount: 1,
-			expErr:   errors.New("insufficient ram"), // 30 - (14+16+1) = -1
+			expErr:   errors.New("insufficient ram"), // 40 - (14+26+1) = -1
 		},
 		"default values; high mem": {
-			memTotal: humanize.GiByte * 60,
+			memTotal: humanize.GiByte * 70,
 			memHuge:  humanize.GiByte * 30,
 			memSys:   DefaultSysMemRsvd,
 			tgtCount: 16,
 			engCount: 2,
-			expSize:  humanize.GiByte * 5, // (60 - (30+16+4)) / 2
+			expSize:  humanize.GiByte * 5, // (70 - (30+26+4)) / 2
 		},
 		"default values; low nr targets": {
-			memTotal: humanize.GiByte * 60,
+			memTotal: humanize.GiByte * 70,
 			memHuge:  humanize.GiByte * 30,
 			memSys:   DefaultSysMemRsvd,
 			tgtCount: 1,
 			engCount: 2,
-			expSize:  humanize.GiByte * 6, // (60 - (30+16+2)) / 2
+			expSize:  humanize.GiByte * 6, // (70 - (30+26+2)) / 2
 		},
 		"custom values; low sys reservation": {
 			memTotal: humanize.GiByte * 60,

diff --git a/src/tests/ftest/control/log_entry.py b/src/tests/ftest/control/log_entry.py
@@ -8,7 +8,7 @@
 
 from apricot import TestWithServers
 from ClusterShell.NodeSet import NodeSet
-from general_utils import get_journalctl, journalctl_time, wait_for_result
+from general_utils import journalctl_time, wait_for_result
 from run_utils import run_remote
 
 
@@ -36,16 +36,14 @@ def _verify_journalctl(self, since, expected_messages):
             since (str): start time for journalctl
             expected_messages (list): list of regular expressions to look for
         """
-        self.log_step('Verify journalctl output since {}'.format(since))
+        self.log_step(f'Verify journalctl output since {since}')
 
         not_found = set(expected_messages)
         journalctl_per_hosts = []
 
         def _search():
             """Look for each message in any host's journalctl."""
-            journalctl_results = get_journalctl(
-                hosts=self.hostlist_servers, since=since, until=journalctl_time(),
-                journalctl_type="daos_server")
+            journalctl_results = self.server_managers[0].get_journalctl(since, journalctl_time())
 
             # Convert the journalctl to a dict of hosts : output
             journalctl_per_hosts.append({})
@@ -76,7 +74,7 @@ def _search():
 
         # Fail if any message was not found
         if not_found:
-            fail_msg = '{} messages not found in journalctl'.format(len(not_found))
+            fail_msg = f'{len(not_found)} messages not found in journalctl'
             self.log.error(fail_msg)
             for message in not_found:
                 self.log.error('  %s', message)