Skip to content

Commit

Permalink
selftests/bpf: watchdog timer for test_progs
Browse files Browse the repository at this point in the history
This commit provides a watchdog timer that sets a limit on how long a
single sub-test can run:
- if sub-test runs for 10 seconds, the name of the test is printed
  (currently the name of the test is printed only after it finishes);
- if sub-test runs for 120 seconds, the running thread is terminated
  with SIGSEGV (to trigger crash_handler() and get a stack trace).

Specifically:
- the timer is armed on each call to run_one_test();
- re-armed at each call to test__start_subtest();
- is stopped when exiting run_one_test().

The default timeout can be overridden using the '-w' or '--watchdog-timeout'
options. A value of 0 can be used to turn the timer off.
Here is an example execution:

    $ ./ssh-exec.sh ./test_progs -w 5 -t \
      send_signal/send_signal_perf_thread_remote,send_signal/send_signal_nmi_thread_remote
    WATCHDOG: test case send_signal/send_signal_nmi_thread_remote executes for 5 seconds, terminating with SIGSEGV
    Caught signal #11!
    Stack trace:
    ./test_progs(crash_handler+0x1f)[0x9049ef]
    /lib64/libc.so.6(+0x40d00)[0x7f1f1184fd00]
    /lib64/libc.so.6(read+0x4a)[0x7f1f1191cc4a]
    ./test_progs[0x720dd3]
    ./test_progs[0x71ef7a]
    ./test_progs(test_send_signal+0x1db)[0x71edeb]
    ./test_progs[0x9066c5]
    ./test_progs(main+0x5ed)[0x9054ad]
    /lib64/libc.so.6(+0x2a088)[0x7f1f11839088]
    /lib64/libc.so.6(__libc_start_main+0x8b)[0x7f1f1183914b]
    ./test_progs(_start+0x25)[0x527385]
    #292     send_signal:FAIL
    test_send_signal_common:PASS:reading pipe 0 nsec
    test_send_signal_common:PASS:reading pipe error: size 0 0 nsec
    test_send_signal_common:PASS:incorrect result 0 nsec
    test_send_signal_common:PASS:pipe_write 0 nsec
    test_send_signal_common:PASS:setpriority 0 nsec

Timer is implemented using timer_{create,settime} librt API.
Internally librt uses pthreads for SIGEV_THREAD timers,
so this change adds a background timer thread to the test process.
Because of this a few checks in tests 'bpf_iter' and 'iters'
need an update to account for an extra thread.

In the parallelized scenario a watchdog is also created for each forked
worker. If one of the workers gets stuck, it is terminated by its
watchdog. In theory, this might lead to a scenario where all workers
are exhausted; however, this should not be a problem for
server_main(), as it would exit with some of the tests not run.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
  • Loading branch information
eddyz87 authored and intel-lab-lkp committed Nov 12, 2024
1 parent 47e2c45 commit d748817
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 6 deletions.
8 changes: 4 additions & 4 deletions tools/testing/selftests/bpf/prog_tests/bpf_iter.c
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,10 @@ static void *run_test_task_tid(void *arg)

linfo.task.tid = 0;
linfo.task.pid = getpid();
/* This includes the parent thread, this thread,
/* This includes the parent thread, this thread, watchdog timer thread
* and the do_nothing_wait thread
*/
test_task_common(&opts, 2, 1);
test_task_common(&opts, 3, 1);

test_task_common_nocheck(NULL, &num_unknown_tid, &num_known_tid);
ASSERT_GT(num_unknown_tid, 2, "check_num_unknown_tid");
Expand Down Expand Up @@ -297,7 +297,7 @@ static void test_task_pid(void)
opts.link_info = &linfo;
opts.link_info_len = sizeof(linfo);

test_task_common(&opts, 1, 1);
test_task_common(&opts, 2, 1);
}

static void test_task_pidfd(void)
Expand All @@ -315,7 +315,7 @@ static void test_task_pidfd(void)
opts.link_info = &linfo;
opts.link_info_len = sizeof(linfo);

test_task_common(&opts, 1, 1);
test_task_common(&opts, 2, 1);

close(pidfd);
}
Expand Down
4 changes: 2 additions & 2 deletions tools/testing/selftests/bpf/prog_tests/iters.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ static void subtest_task_iters(void)
syscall(SYS_getpgid);
iters_task__detach(skel);
ASSERT_EQ(skel->bss->procs_cnt, 1, "procs_cnt");
ASSERT_EQ(skel->bss->threads_cnt, thread_num + 1, "threads_cnt");
ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 1, "proc_threads_cnt");
ASSERT_EQ(skel->bss->threads_cnt, thread_num + 2, "threads_cnt");
ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 2, "proc_threads_cnt");
ASSERT_EQ(skel->bss->invalid_cnt, 0, "invalid_cnt");
pthread_mutex_unlock(&do_nothing_mutex);
for (int i = 0; i < thread_num; i++)
Expand Down
104 changes: 104 additions & 0 deletions tools/testing/selftests/bpf/test_progs.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <sys/socket.h>
#include <sys/un.h>
#include <bpf/btf.h>
#include <time.h>
#include "json_writer.h"

#include "network_helpers.h"
Expand Down Expand Up @@ -179,6 +180,88 @@ int usleep(useconds_t usec)
return syscall(__NR_nanosleep, &ts, NULL);
}

/* Watchdog timer is started by watchdog_start() and stopped by watchdog_stop().
 * If timer is active for longer than env.secs_till_notify,
 * it prints the name of the current test to the stderr.
 * If timer is active for longer than env.secs_till_kill,
 * it kills the thread executing the test by sending a SIGSEGV signal to it.
 *
 * The timer is one-shot: the notify stage re-arms it for the time that is
 * left until the kill deadline. The sigval argument is unused.
 */
static void watchdog_timer_func(union sigval sigval)
{
	struct itimerspec timeout = {};
	char test_name[256];
	int secs_left;
	int err;

	/* Report "test/subtest" while a sub-test is active, plain "test" otherwise */
	if (env.subtest_state)
		snprintf(test_name, sizeof(test_name), "%s/%s",
			 env.test->test_name, env.subtest_state->name);
	else
		snprintf(test_name, sizeof(test_name), "%s",
			 env.test->test_name);

	switch (env.watchdog_state) {
	case WD_NOTIFY:
		fprintf(env.stderr_saved, "WATCHDOG: test case %s executes for %d seconds...\n",
			test_name, env.secs_till_notify);
		env.watchdog_state = WD_KILL;
		secs_left = env.secs_till_kill - env.secs_till_notify;
		/* A zero it_value disarms the timer instead of re-arming it, so
		 * only re-arm when there is time left until the kill deadline.
		 * Otherwise (e.g. both timeouts are equal) terminate right away.
		 */
		if (secs_left > 0) {
			timeout.it_value.tv_sec = secs_left;
			err = timer_settime(env.watchdog, 0, &timeout, NULL);
			if (err)
				fprintf(env.stderr_saved, "Failed to arm watchdog timer\n");
			break;
		}
		/* fallthrough */
	case WD_KILL:
		fprintf(env.stderr_saved,
			"WATCHDOG: test case %s executes for %d seconds, terminating with SIGSEGV\n",
			test_name, env.secs_till_kill);
		pthread_kill(env.main_thread, SIGSEGV);
		break;
	default:
		break;
	}
}

static void watchdog_start(void)
{
struct itimerspec timeout = {};
int err;

if (env.secs_till_kill == 0)
return;
if (env.secs_till_notify > 0) {
env.watchdog_state = WD_NOTIFY;
timeout.it_value.tv_sec = env.secs_till_notify;
} else {
env.watchdog_state = WD_KILL;
timeout.it_value.tv_sec = env.secs_till_kill;
}
err = timer_settime(env.watchdog, 0, &timeout, NULL);
if (err)
fprintf(env.stderr_saved, "Failed to start watchdog timer\n");
}

static void watchdog_stop(void)
{
struct itimerspec timeout = {};
int err;

env.watchdog_state = WD_NOTIFY;
err = timer_settime(env.watchdog, 0, &timeout, NULL);
if (err)
fprintf(env.stderr_saved, "Failed to stop watchdog timer\n");
}

static void watchdog_init(void)
{
struct sigevent watchdog_sev = {
.sigev_notify = SIGEV_THREAD,
.sigev_notify_function = watchdog_timer_func,
};
int err;

env.main_thread = pthread_self();
err = timer_create(CLOCK_MONOTONIC, &watchdog_sev, &env.watchdog);
if (err)
fprintf(stderr, "Failed to initialize watchdog timer\n");
}

static bool should_run(struct test_selector *sel, int num, const char *name)
{
int i;
Expand Down Expand Up @@ -515,6 +598,7 @@ bool test__start_subtest(const char *subtest_name)

env.subtest_state = subtest_state;
stdio_hijack_init(&subtest_state->log_buf, &subtest_state->log_cnt);
watchdog_start();

return true;
}
Expand Down Expand Up @@ -780,6 +864,7 @@ enum ARG_KEYS {
ARG_DEBUG = -1,
ARG_JSON_SUMMARY = 'J',
ARG_TRAFFIC_MONITOR = 'm',
ARG_WATCHDOG_TIMEOUT = 'w',
};

static const struct argp_option opts[] = {
Expand Down Expand Up @@ -810,6 +895,8 @@ static const struct argp_option opts[] = {
{ "traffic-monitor", ARG_TRAFFIC_MONITOR, "NAMES", 0,
"Monitor network traffic of tests with name matching the pattern (supports '*' wildcard)." },
#endif
{ "watchdog-timeout", ARG_WATCHDOG_TIMEOUT, "SECONDS", 0,
"Kill the process if tests are not making progress for specified number of seconds." },
{},
};

Expand Down Expand Up @@ -1035,6 +1122,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
true);
break;
#endif
case ARG_WATCHDOG_TIMEOUT:
env->secs_till_kill = atoi(arg);
if (env->secs_till_kill < 0) {
fprintf(stderr, "Invalid watchdog timeout: %s.\n", arg);
return -EINVAL;
}
if (env->secs_till_kill < env->secs_till_notify) {
env->secs_till_notify = 0;
}
break;
default:
return ARGP_ERR_UNKNOWN;
}
Expand Down Expand Up @@ -1263,10 +1360,12 @@ static void run_one_test(int test_num)

stdio_hijack(&state->log_buf, &state->log_cnt);

watchdog_start();
if (test->run_test)
test->run_test();
else if (test->run_serial_test)
test->run_serial_test();
watchdog_stop();

/* ensure last sub-test is finalized properly */
if (env.subtest_state)
Expand Down Expand Up @@ -1707,6 +1806,7 @@ static int worker_main_send_subtests(int sock, struct test_state *state)
static int worker_main(int sock)
{
save_netns();
watchdog_init();

while (true) {
/* receive command */
Expand Down Expand Up @@ -1816,6 +1916,8 @@ int main(int argc, char **argv)

sigaction(SIGSEGV, &sigact, NULL);

env.secs_till_notify = 10;
env.secs_till_kill = 120;
err = argp_parse(&argp, argc, argv, 0, NULL, &env);
if (err)
return err;
Expand All @@ -1824,6 +1926,8 @@ int main(int argc, char **argv)
if (err)
return err;

watchdog_init();

/* Use libbpf 1.0 API mode */
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
libbpf_set_print(libbpf_print_fn);
Expand Down
6 changes: 6 additions & 0 deletions tools/testing/selftests/bpf/test_progs.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ struct test_env {
pid_t *worker_pids; /* array of worker pids */
int *worker_socks; /* array of worker socks */
int *worker_current_test; /* array of current running test for each worker */

pthread_t main_thread;
int secs_till_notify;
int secs_till_kill;
timer_t watchdog; /* watch for stalled tests/subtests */
enum { WD_NOTIFY, WD_KILL } watchdog_state;
};

#define MAX_LOG_TRUNK_SIZE 8192
Expand Down

0 comments on commit d748817

Please sign in to comment.