forked from youki-dev/youki
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fork.rs
377 lines (344 loc) · 15.9 KB
/
fork.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
use std::{ffi::c_int, fs::File, num::NonZeroUsize};
use libc::SIGCHLD;
use nix::{
sys::{mman, resource},
unistd::Pid,
};
#[derive(Debug, thiserror::Error)]
pub enum CloneError {
#[error("failed to clone process")]
Clone(#[source] nix::Error),
#[error("failed to get system memory page size")]
PageSize(#[source] nix::Error),
#[error("failed to get resource limit")]
ResourceLimit(#[source] nix::Error),
#[error("the stack size is zero")]
ZeroStackSize,
#[error("failed to allocate stack")]
StackAllocation(#[source] nix::Error),
#[error("failed to create stack guard page")]
GuardPage(#[source] nix::Error),
#[error("unknown error code {0}")]
UnknownErrno(i32),
}
/// The callback function used in clone system call. The return value is i32
/// which is consistent with C functions return code. The closure is boxed
/// because we need to store the closure on heap, not stack in the case of
/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to
/// pass in a child stack, which is empty. By storing the closure in heap, we
/// can then in the new process to re-box the heap memory back to a closure
/// correctly.
pub type CloneCb = Box<dyn FnOnce() -> i32>;
// Clone a sibling process that shares the same parent as the calling
// process. This is used to launch the container init process so the parent
// process of the calling process can receive ownership of the process. If we
// clone a child process as the init process, the calling process (likely the
// youki main process) will exit and the init process will be re-parented to the
// process 1 (system init process), which is not the right behavior of what we
// look for.
pub fn container_clone_sibling(cb: CloneCb) -> Result<Pid, CloneError> {
// Note: normally, an exit signal is required, but when using
// `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set.
// The older `clone` will not return EINVAL in this case. Instead it ignores
// the exit signal bits in the glibc wrapper. Therefore, we explicitly set
// the exit_signal to None here, so this works for both version of clone.
clone_internal(cb, libc::CLONE_PARENT as u64, None)
}
// Clone a child process and execute the callback.
pub fn container_clone(cb: CloneCb) -> Result<Pid, CloneError> {
clone_internal(cb, 0, Some(SIGCHLD as u64))
}
#[cfg(feature = "no-clone3")]
// We should not use clone3 as required by the no-clone3 feature. Call clone directly.
fn clone_internal(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
log::info!("YYYYYY libcontainer::fork >> about to do normal clone");
clone(cb, flags, exit_signal)
}
#[cfg(not(feature = "no-clone3"))]
// Try to use clone3, and fallback to clone in case of ENOSYS.
// Unlike the clone call, clone3 is currently using the kernel syscall.
fn clone_internal(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
log::info!("YYYYYY libcontainer::fork >> about to do clone3");
#[repr(C)]
struct Clone3Args {
flags: u64,
pidfd: u64,
child_tid: u64,
parent_tid: u64,
exit_signal: u64,
stack: u64,
stack_size: u64,
tls: u64,
set_tid: u64,
set_tid_size: u64,
cgroup: u64,
}
let mut args = Clone3Args {
flags,
pidfd: 0,
child_tid: 0,
parent_tid: 0,
exit_signal: exit_signal.unwrap_or(0),
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};
let args_ptr = &mut args as *mut Clone3Args;
let args_size = std::mem::size_of::<Clone3Args>();
// For now, we can only use clone3 as a kernel syscall. Libc wrapper is not
// available yet. This can have undefined behavior because libc authors do
// not like people calling kernel syscall to directly create processes. Libc
// does perform additional bookkeeping when calling clone or fork.
// So far we've observed issues when the main process uses threads. In such
// cases enable the `no-clone3` feature.
// See https://github.com/containerd/runwasi/issues/347
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
-1 if nix::Error::last() == nix::Error::ENOSYS => clone(cb, flags, exit_signal),
-1 => Err(CloneError::Clone(nix::Error::last())),
0 => {
// Inside the cloned process, we execute the callback and exit with
// the return code.
std::process::exit(cb());
}
ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)),
ret => Err(CloneError::UnknownErrno(ret as i32)),
}
}
fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K
// Use sysconf to find the page size. If there is an error, we assume
// the default 4K page size.
let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE)
.map_err(CloneError::PageSize)?
.map(|size| size as usize)
.unwrap_or(DEFAULT_PAGE_SIZE);
// Find out the default stack max size through getrlimit.
let (rlim_cur, _) =
resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;
// RLIMIT_STACK is an upper bound but it might be set unreasonably high or
// even unlimited, which may cause mmap to error with ENOMEM.
// Most linux system today use a stack size of 8MB.
// Use the most constraining between RLIMIT_STACK or 8MB for the stack size.
let stack_size = rlim_cur.min(DEFAULT_STACK_SIZE as u64) as usize;
// Using the clone syscall requires us to create the stack space for the
// child process instead of taken cared for us like fork call. We use mmap
// here to create the stack. Instead of guessing how much space the child
// process needs, we allocate through mmap. This is OK since mmap will only
// reserve the address space upfront, instead of allocating physical memory
// upfront. The stack will grow as needed, up to the size reserved, so no
// wasted memory here. Lastly, the child stack only needs to support the
// container init process set up code in Youki. When Youki calls exec into
// the container payload, exec will reset the stack.
// Note: do not use MAP_GROWSDOWN since it is not well supported.
// Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
let child_stack = unsafe {
// Since nix = "0.27.1", `mmap()` requires a generic type `F: AsFd`.
// `::<File>` doesn't have any meaning because we won't use it.
mman::mmap::<File>(
None,
NonZeroUsize::new(stack_size).ok_or(CloneError::ZeroStackSize)?,
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
None,
0,
)
.map_err(CloneError::StackAllocation)?
};
unsafe {
// Consistent with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.map_err(CloneError::GuardPage)?;
};
// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(stack_size) };
// Combine the clone flags with exit signals.
let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;
// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
// and a struct. However, when casting the box closure into libc::c_void,
// the function pointer will be lost. Therefore, to work around the issue,
// we double box the closure. This is consistent with how std::unix::thread
// handles the closure.
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
let data = Box::into_raw(Box::new(cb));
// The main is a wrapper function passed into clone call below. The "data"
// arg is actually a raw pointer to the Box closure. so here, we re-box the
// pointer back into a box closure so the main takes ownership of the
// memory. Then we can call the closure.
extern "C" fn main(data: *mut libc::c_void) -> c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() }
}
// The nix::sched::clone wrapper doesn't provide the right interface. Using
// the clone syscall is one of the rare cases where we don't want rust to
// manage the child stack memory. Instead, we want to use c_void directly
// here. Therefore, here we are using libc::clone syscall directly for
// better control. The child stack will be cleaned when exec is called or
// the child process terminates. The nix wrapper also does not treat the
// closure memory correctly. The wrapper implementation fails to pass the
// right ownership to the new child process.
// Ref: https://github.com/nix-rust/nix/issues/919
// Ref: https://github.com/nix-rust/nix/pull/920
let ret = unsafe {
libc::clone(
main,
child_stack_top,
combined_flags,
data as *mut libc::c_void,
)
};
// After the clone returns, the heap memory associated with the Box closure
// is duplicated in the cloned process. Therefore, we can safely re-box the
// closure from the raw pointer and let rust to continue managing the
// memory. We call drop here explicitly to avoid the warning that the
// closure is not used. This is correct since the closure is called in the
// cloned process, not the parent process.
unsafe { drop(Box::from_raw(data)) };
match ret {
-1 => Err(CloneError::Clone(nix::Error::last())),
pid if ret > 0 => Ok(Pid::from_raw(pid)),
_ => unreachable!("clone returned a negative pid {ret}"),
}
}
#[cfg(test)]
mod test {
use crate::channel::channel;
use super::*;
use anyhow::{bail, Context, Result};
use nix::sys::wait::{waitpid, WaitStatus};
use nix::unistd;
#[test]
fn test_container_fork() -> Result<()> {
let pid = container_clone(Box::new(|| 0))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
Ok(())
}
_ => bail!("test failed"),
}
}
#[test]
fn test_container_err_fork() -> Result<()> {
let pid = container_clone(Box::new(|| -1))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 255);
Ok(())
}
_ => bail!("test failed"),
}
}
#[test]
fn test_container_clone_sibling() -> Result<()> {
// The `container_clone_sibling` will create a sibling process (share
// the same parent) of the calling process. In Unix, a process can only
// wait on the immediate children process and can't wait on the sibling
// process. Therefore, to test the logic, we will have to fork a process
// first and then let the forked process call `container_clone_sibling`.
// Then the testing process (the process where test is called), who are
// the parent to this forked process and the sibling process cloned by
// the `container_clone_sibling`, can wait on both processes.
// We need to use a channel so that the forked process can pass the pid
// of the sibling process to the testing process.
let (sender, receiver) = &mut channel::<i32>()?;
match unsafe { unistd::fork()? } {
unistd::ForkResult::Parent { child } => {
let sibling_process_pid =
Pid::from_raw(receiver.recv().with_context(|| {
"failed to receive the sibling pid from forked process"
})?);
receiver.close()?;
match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(sibling_process_pid, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the sibling process"),
}
// After sibling process exits, we can wait on the forked process.
match waitpid(child, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(child, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the forked process"),
}
}
unistd::ForkResult::Child => {
// Inside the forked process. We call `container_clone` and pass
// the pid to the parent process.
let pid = container_clone_sibling(Box::new(|| 0))?;
sender.send(pid.as_raw())?;
sender.close()?;
std::process::exit(0);
}
};
Ok(())
}
// This test depends on libseccomp to work.
#[cfg(feature = "libseccomp")]
#[test]
fn test_clone_fallback() -> Result<()> {
use crate::test_utils::TestCallbackError;
use oci_spec::runtime::{
Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder,
};
fn has_clone3() -> bool {
// We use the probe syscall to check if the kernel supports clone3 or
// seccomp has successfully blocked clone3.
let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) };
let err = (res == -1)
.then(std::io::Error::last_os_error)
.expect("probe syscall should not succeed");
err.raw_os_error() != Some(libc::ENOSYS)
}
// To test the fallback behavior, we will create a seccomp rule that
// blocks `clone3` as ENOSYS.
let syscall = LinuxSyscallBuilder::default()
.names(vec![String::from("clone3")])
.action(LinuxSeccompAction::ScmpActErrno)
.errno_ret(libc::ENOSYS as u32)
.build()?;
let seccomp_profile = LinuxSeccompBuilder::default()
.default_action(LinuxSeccompAction::ScmpActAllow)
.architectures(vec![Arch::ScmpArchNative])
.syscalls(vec![syscall])
.build()?;
crate::test_utils::test_in_child_process(|| {
// We use seccomp to block `clone3`
let _ = prctl::set_no_new_privileges(true);
crate::seccomp::initialize_seccomp(&seccomp_profile)
.expect("failed to initialize seccomp");
if has_clone3() {
return Err(TestCallbackError::Custom(
"clone3 is not blocked by seccomp".into(),
));
}
let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
}
status => {
return Err(TestCallbackError::Custom(format!(
"failed to wait on child process: {:?}",
status
)));
}
};
Ok(())
})?;
Ok(())
}
}