Skip to content

Commit

Permalink
Fix OOM slice removal race
Browse files Browse the repository at this point in the history
If the slice is already removed then we mostly encounter two different
errors:

- `get next line: No such device (os error 19)`
- `open memory events file: /sys/fs/cgroup/test.slice/crio-$ID.scope/memory.events: No such file or directory (os error 2)`

To avoid such a race we now check after the errors if the file still
exists. If not, then we assume an OOM.

Signed-off-by: Sascha Grunert <sgrunert@redhat.com>
  • Loading branch information
saschagrunert committed Aug 2, 2024
1 parent 453ca90 commit 1018592
Showing 1 changed file with 50 additions and 5 deletions.
55 changes: 50 additions & 5 deletions conmon-rs/server/src/oom_watcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,13 +224,33 @@ impl OOMWatcher {
if let Err(e) = tx.try_send(OOMEvent{ oom: false }) {
error!("try_send failed: {:#}", e);
};

debug!("Last resort check for OOM");
match Self::check_for_oom(&memory_events_file_path, last_counter).await {
Ok((counter, is_oom)) => {
if !is_oom {
debug!("No OOM found in event");
break;
}
debug!("Found OOM event count {}", counter);
if let Err(e) = Self::write_oom_files(exit_paths).await {
error!("Writing OOM files failed: {:#}", e)
}
if let Err(e) = tx.try_send(OOMEvent{ oom: true }) {
error!("Try send failed: {:#}", e)
};
}
Err(e) => error!("Checking for OOM failed: {:#}", e),
}

break;
}
Some(res) = rx.recv() => {
match res {
Ok(event) => {
debug!("Got event OOM file event: {:?}", event);
debug!("Got OOM file event: {:?}", event);
if event.kind.is_remove() {
debug!("Got remove event");
if let Err(e) = tx.try_send(OOMEvent{ oom: false }) {
error!("try_send failed: {:#}", e);
};
Expand All @@ -239,9 +259,10 @@ impl OOMWatcher {
match Self::check_for_oom(&memory_events_file_path, last_counter).await {
Ok((counter, is_oom)) => {
if !is_oom {
debug!("No OOM found in event");
continue;
}
debug!(counter, "Found OOM event");
debug!("Found OOM event count {}", counter);
if let Err(e) = Self::write_oom_files(exit_paths).await {
error!("Writing OOM files failed: {:#}", e);
}
Expand All @@ -252,8 +273,19 @@ impl OOMWatcher {
};
}
Err(e) => {
error!("Checking for OOM failed: {}", e);
match tx.try_send(OOMEvent{ oom: false }) {
error!("Checking for OOM failed: {:#}", e);

let mut oom = false;
if Self::file_not_found(&memory_events_file_path).await {
debug!("Assuming memory slice removal race, still reporting one OOM event");
if let Err(e) = Self::write_oom_files(exit_paths).await {
error!("Writing OOM files failed: {:#}", e);
}
last_counter = 1;
oom = true;
}

match tx.try_send(OOMEvent{ oom }) {
Ok(_) => break,
Err(e) => error!("try_send failed: {:#}", e)
};
Expand Down Expand Up @@ -281,11 +313,24 @@ impl OOMWatcher {
Ok(())
}

/// Checks if a file does not exist on disk.
async fn file_not_found(f: impl AsRef<Path>) -> bool {
// TODO: use is_err_and if we can use Rust 1.70.0:
// https://doc.rust-lang.org/std/result/enum.Result.html#method.is_err_and
match tokio::fs::metadata(f).await {
Err(e) => e.kind() == ErrorKind::NotFound,
_ => false,
}
}

async fn check_for_oom(
memory_events_file_path: &Path,
last_counter: u64,
) -> Result<(u64, bool)> {
debug!("Checking for possible OOM");
debug!(
"Checking for possible OOM in {}",
memory_events_file_path.display()
);
let mut new_counter: u64 = 0;
let mut found_oom = false;
let fp = File::open(memory_events_file_path).await.context(format!(
Expand Down

0 comments on commit 1018592

Please sign in to comment.