Skip to content

Commit

Permalink
Merge branch 'pahender/DAOS-16220' into pahender/DAOS-16220_user_test
Browse files Browse the repository at this point in the history
Skip-unit-tests: true
Skip-fault-injection-test: true
Features: deployment performance

Required-githooks: true

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
  • Loading branch information
phender committed Oct 22, 2024
2 parents 9d0be47 + 2507648 commit 9ef8594
Show file tree
Hide file tree
Showing 23 changed files with 450 additions and 146 deletions.
4 changes: 4 additions & 0 deletions ci/unit/test_nlt_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,9 @@ pip install --requirement requirements-utest.txt

pip install /opt/daos/lib/daos/python/

# set high open file limit in the shell to avoid extra warning
sudo prlimit --nofile=1024:262144 --pid $$
prlimit -n

./utils/node_local_test.py --max-log-size 1700MiB --dfuse-dir /localhome/jenkins/ \
--log-usage-save nltir.xml --log-usage-export nltr.json all
2 changes: 1 addition & 1 deletion docs/admin/env_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Environment variables in this section only apply to the server side.
|DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.|
|DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.|
|CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.|
|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
|DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.|
|DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"|
|DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.|
Expand Down
12 changes: 7 additions & 5 deletions src/cart/README.env
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
This file lists the environment variables used in CaRT.

. D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR)
It determines which mercury NA plugin to be used:
It determines which mercury NA plugin and transport to be used:
- set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider
- set it as "ofi+gni" to use OFI gni provider
- set it as "sm" to use SM plugin which only works within single node
- set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider.
- set it as "ofi+sockets" to use OFI sockets provider
NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm"
- set it as "ofi+tcp" to use OFI tcp provider.
- by default (not set or set as any other value) it will use ofi tcp
provider.

Expand Down Expand Up @@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT.
start copying data in an effort to release multi-recv buffers. Copy will occur when at
most D_MRECV_BUF_COPY buffers remain.

SWIM_TRAFFIC_CLASS
(server only) Select a traffic class for the SWIM protocol to use and prevent potential
traffic congestion. Available options are: "unspec" (default), "best_effort",
"low_latency", "bulk_data".

3 changes: 3 additions & 0 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
init_info.request_post_incr = crt_gdata.cg_post_incr;
init_info.multi_recv_op_max = crt_gdata.cg_mrecv_buf;
init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy;
/* Separate SWIM traffic in an effort to prevent potential congestion. */
if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_ctx_idx)
init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc;

hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info);
if (hg_class == NULL) {
Expand Down
82 changes: 79 additions & 3 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ static volatile int gdata_init_flag;
struct crt_plugin_gdata crt_plugin_gdata;
static bool g_prov_settings_applied[CRT_PROV_COUNT];

/*
 * Traffic class name table: expands CRT_TRAFFIC_CLASSES with X() selecting the
 * second field of every entry, yielding one array element per listed class.
 */
#define X(a, b) b,
static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES};
#undef X

static void
crt_lib_init(void) __attribute__((__constructor__));

Expand Down Expand Up @@ -228,6 +232,17 @@ crt_gdata_dump(void)
DUMP_GDATA_FIELD("%d", cg_rpc_quota);
}

static enum crt_traffic_class
crt_str_to_tc(const char *str)
{
enum crt_traffic_class i = 0;

while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN)
i++;

return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i;
}

/* first step init - for initializing crt_gdata */
static int
data_init(int server, crt_init_options_t *opt)
Expand All @@ -238,9 +253,10 @@ data_init(int server, crt_init_options_t *opt)
uint32_t mem_pin_enable = 0;
uint32_t is_secondary;
uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR;
unsigned int mrecv_buf = CRT_HG_MRECV_BUF;
unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */
int rc = 0;
unsigned int mrecv_buf = CRT_HG_MRECV_BUF;
unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */
char *swim_traffic_class = NULL;
int rc = 0;

crt_env_dump();

Expand All @@ -253,6 +269,8 @@ data_init(int server, crt_init_options_t *opt)
crt_gdata.cg_mrecv_buf = mrecv_buf;
crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy);
crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy;
crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class);
crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class);

is_secondary = 0;
/* Apply CART-890 workaround for server side only */
Expand Down Expand Up @@ -492,6 +510,61 @@ check_grpid(crt_group_id_t grpid)
return rc;
}

#define CRT_MIN_TCP_FD 131072

/**
 * Raise the soft RLIMIT_NOFILE limit of the calling process when it is below
 * CRT_MIN_TCP_FD.
 *
 * For some providers, we require a file descriptor for every connection, and
 * some platforms set the soft limit so low that we run out. The soft limit is
 * raised to the hard limit to avoid this; when the hard limit itself is below
 * CRT_MIN_TCP_FD, uid 0 additionally tries to raise the hard limit, while any
 * other user only gets a warning. Failures are logged and otherwise ignored.
 */
static void
file_limit_bump(void)
{
	int           rc;
	struct rlimit rlim;

	/* Bump file descriptor limit if low and if possible */
	rc = getrlimit(RLIMIT_NOFILE, &rlim);
	if (rc != 0) {
		DS_ERROR(errno, "getrlimit() failed. Unable to check file descriptor limit");
		/** Per the man page, this can only fail if rlim is invalid */
		D_ASSERT(0);
		return;
	}

	/* Soft limit is already high enough, nothing to do */
	if (rlim.rlim_cur >= CRT_MIN_TCP_FD)
		return;

	/*
	 * rlim_t is only guaranteed to be an unsigned integer type, so every
	 * value printed below is cast to unsigned long long to match %llu.
	 */
	if (rlim.rlim_max < CRT_MIN_TCP_FD) {
		if (getuid() != 0) {
			D_WARN("File descriptor hard limit should be at least %d, limit is %llu\n",
			       CRT_MIN_TCP_FD, (unsigned long long)rlim.rlim_max);
		} else {
			/** root should be able to change it */
			D_INFO("Super user attempting to update hard file descriptor limit to %d,"
			       " limit was %llu\n",
			       CRT_MIN_TCP_FD, (unsigned long long)rlim.rlim_max);
			rlim.rlim_max = CRT_MIN_TCP_FD;
		}

		/* Soft limit already sits at the (possibly unchanged) hard limit */
		if (rlim.rlim_cur >= rlim.rlim_max)
			return;

		/* May as well bump it as much as we can */
	}

	rlim.rlim_cur = rlim.rlim_max;
	rc            = setrlimit(RLIMIT_NOFILE, &rlim);
	if (rc != 0) {
		DS_ERROR(errno,
			 "setrlimit() failed. Unable to bump file descriptor"
			 " limit to value >= %d, limit is %llu",
			 CRT_MIN_TCP_FD, (unsigned long long)rlim.rlim_max);
		return;
	}
	D_INFO("Updated soft file descriptor limit to %llu\n", (unsigned long long)rlim.rlim_max);
}

static void
prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt)
{
Expand All @@ -510,6 +583,9 @@ prov_settings_apply(bool primary, crt_provider_t prov, crt_init_options_t *opt)
d_setenv("FI_OFI_RXM_DEF_TCP_WAIT_OBJ", "pollfd", 0);
}

if (prov == CRT_PROV_OFI_TCP || prov == CRT_PROV_OFI_TCP_RXM)
file_limit_bump();

if (prov == CRT_PROV_OFI_CXI)
mrc_enable = 1;

Expand Down
15 changes: 15 additions & 0 deletions src/cart/crt_internal_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,17 @@ struct crt_na_config {
char **noc_domain_str; /* Array of domains */
};

/*
 * List of traffic classes as X(enumerator, name) pairs. The list is expanded
 * below with X() selecting the first field to declare enum crt_traffic_class,
 * so the enumerators follow the order of this list.
 */
#define CRT_TRAFFIC_CLASSES \
X(CRT_TC_UNSPEC, "unspec") /* Leave it up to the plugin to choose */ \
X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */ \
X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */ \
X(CRT_TC_BULK_DATA, "bulk_data") /* Bulk data */ \
X(CRT_TC_UNKNOWN, "unknown") /* Unknown */

/* Keep only the enumerator of each entry */
#define X(a, b) a,
enum crt_traffic_class { CRT_TRAFFIC_CLASSES };
#undef X

struct crt_prov_gdata {
/** NA plugin type */
int cpg_provider;
Expand Down Expand Up @@ -105,6 +116,9 @@ struct crt_gdata {
/** cart context index used by SWIM */
int32_t cg_swim_ctx_idx;

/** traffic class used by SWIM */
enum crt_traffic_class cg_swim_tc;

/** credits limitation for #in-flight RPCs per target EP CTX */
uint32_t cg_credit_ep_ctx;

Expand Down Expand Up @@ -220,6 +234,7 @@ struct crt_event_cb_priv {
ENV(SWIM_PING_TIMEOUT) \
ENV(SWIM_PROTOCOL_PERIOD_LEN) \
ENV(SWIM_SUSPECT_TIMEOUT) \
ENV_STR(SWIM_TRAFFIC_CLASS) \
ENV_STR(UCX_IB_FORK_INIT)

/* uint env */
Expand Down
8 changes: 6 additions & 2 deletions src/cart/crt_iv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info)
return rc;

send_error:
rc = crt_bulk_free(cb_info->buc_bulk_hdl);
/* send back whatever error got us here */
output->rc = rc;
rc = crt_bulk_free(cb_info->buc_bulk_hdl);
if (rc != 0)
DL_ERROR(rc, "crt_bulk_free() failed");

iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value,
cb_info->buc_user_priv);

Expand Down
2 changes: 1 addition & 1 deletion src/control/cmd/dmg/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ disable_vfio: false
disable_vmd: false
enable_hotplug: false
nr_hugepages: 0
system_ram_reserved: 16
system_ram_reserved: 26
disable_hugepages: false
control_log_mask: INFO
control_log_file: /tmp/daos_server.log
Expand Down
2 changes: 1 addition & 1 deletion src/control/server/storage/scm.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ const (

// Memory reservation constant defaults to be used when calculating RAM-disk size for DAOS I/O engine.
const (
DefaultSysMemRsvd = humanize.GiByte * 16 // per-system
DefaultSysMemRsvd = humanize.GiByte * 26 // per-system
DefaultTgtMemRsvd = humanize.MiByte * 128 // per-engine-target
DefaultEngineMemRsvd = humanize.GiByte * 1 // per-engine
)
Expand Down
14 changes: 7 additions & 7 deletions src/control/server/storage/scm_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2023 Intel Corporation.
// (C) Copyright 2023-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -39,28 +39,28 @@ func Test_CalcRamdiskSize(t *testing.T) {
expErr: errors.New("requires positive nonzero nr engines"),
},
"default values; low mem": {
memTotal: humanize.GiByte * 30,
memTotal: humanize.GiByte * 40,
memHuge: humanize.GiByte * 14,
memSys: DefaultSysMemRsvd,
tgtCount: 8,
engCount: 1,
expErr: errors.New("insufficient ram"), // 30 - (14+16+1) = -1
expErr: errors.New("insufficient ram"), // 40 - (14+26+1) = -1
},
"default values; high mem": {
memTotal: humanize.GiByte * 60,
memTotal: humanize.GiByte * 70,
memHuge: humanize.GiByte * 30,
memSys: DefaultSysMemRsvd,
tgtCount: 16,
engCount: 2,
expSize: humanize.GiByte * 5, // (60 - (30+16+4)) / 2
expSize: humanize.GiByte * 5, // (70 - (30+26+4)) / 2
},
"default values; low nr targets": {
memTotal: humanize.GiByte * 60,
memTotal: humanize.GiByte * 70,
memHuge: humanize.GiByte * 30,
memSys: DefaultSysMemRsvd,
tgtCount: 1,
engCount: 2,
expSize: humanize.GiByte * 6, // (60 - (30+16+2)) / 2
expSize: humanize.GiByte * 6, // (70 - (30+26+2)) / 2
},
"custom values; low sys reservation": {
memTotal: humanize.GiByte * 60,
Expand Down
10 changes: 4 additions & 6 deletions src/tests/ftest/control/log_entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from apricot import TestWithServers
from ClusterShell.NodeSet import NodeSet
from general_utils import get_journalctl, journalctl_time, wait_for_result
from general_utils import journalctl_time, wait_for_result
from run_utils import run_remote


Expand Down Expand Up @@ -36,16 +36,14 @@ def _verify_journalctl(self, since, expected_messages):
since (str): start time for journalctl
expected_messages (list): list of regular expressions to look for
"""
self.log_step('Verify journalctl output since {}'.format(since))
self.log_step(f'Verify journalctl output since {since}')

not_found = set(expected_messages)
journalctl_per_hosts = []

def _search():
"""Look for each message in any host's journalctl."""
journalctl_results = get_journalctl(
hosts=self.hostlist_servers, since=since, until=journalctl_time(),
journalctl_type="daos_server")
journalctl_results = self.server_managers[0].get_journalctl(since, journalctl_time())

# Convert the journalctl to a dict of hosts : output
journalctl_per_hosts.append({})
Expand Down Expand Up @@ -76,7 +74,7 @@ def _search():

# Fail if any message was not found
if not_found:
fail_msg = '{} messages not found in journalctl'.format(len(not_found))
fail_msg = f'{len(not_found)} messages not found in journalctl'
self.log.error(fail_msg)
for message in not_found:
self.log.error(' %s', message)
Expand Down
Loading

0 comments on commit 9ef8594

Please sign in to comment.