Skip to content
This repository has been archived by the owner on Sep 15, 2021. It is now read-only.

Commit

Permalink
feat: add 5 min timeout for buckets' comm op
Browse files Browse the repository at this point in the history
  • Loading branch information
NOBLES5E committed Jun 18, 2021
1 parent da8c59d commit d83005d
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 0 deletions.
20 changes: 20 additions & 0 deletions bagua-core-internal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ use crate::telemetry::{SCHEDULED_THREAD_POOL, TELEMETRY};
use cpp::cpp;
use datatypes::{BaguaBucket, BaguaTensor};
use events::BaguaEventChannel;
use flume::RecvTimeoutError;
use hashbrown::{HashMap, HashSet};
use std::collections::VecDeque;
use std::fmt::Debug;
use std::sync::Arc;
use std::time::Duration;
use thiserror::Error;

cpp! {{
Expand Down Expand Up @@ -120,6 +122,7 @@ pub struct BaguaCommBackend {
channels: Arc<BaguaCommOpChannels>,
managed_ptrs: HashSet<u64>,
comm_worker: std::thread::JoinHandle<()>,
comm_monitor: std::thread::JoinHandle<()>,
}

impl BaguaCommBackend {
Expand Down Expand Up @@ -168,6 +171,10 @@ impl BaguaCommBackend {

let channels = Arc::new(BaguaCommOpChannels::new(schedule_channel_cap));
let channels_clone = channels.clone();
let (monitor_op_start_channel_sender, monitor_op_start_channel_receiver) =
flume::unbounded();
let (monitor_op_finish_channel_sender, monitor_op_finish_channel_receiver) =
flume::unbounded();

BaguaCommBackend {
ordered_buckets: Default::default(),
Expand All @@ -190,6 +197,7 @@ impl BaguaCommBackend {
"worker received scheduled communication operation {:?}",
comm_op
);
monitor_op_start_channel_sender.send(comm_op.bucket.clone());
for op in &comm_op.ops {
op.execute_background_communication(
comm_op.bucket.clone(),
Expand All @@ -199,6 +207,18 @@ impl BaguaCommBackend {
tracing::debug!("comm op executed: {:?}", comm_op);
comm_op.event_channel.finish();
tracing::debug!("comm op marked finished: {:?}", comm_op);
monitor_op_finish_channel_sender.send(());
}
}),
// Watchdog thread for the communication worker. Each iteration pairs one
// "op started" message (carrying the bucket of the scheduled comm op) with
// one "op finished" message, and panics if the finish signal does not
// arrive within 5 minutes of the start signal being received.
comm_monitor: std::thread::spawn(move || loop {
    // Block until the next comm op is reported as started; the received
    // bucket is kept only to identify the op in the panic message.
    let op_bucket = monitor_op_start_channel_receiver
        .recv()
        .expect("monitor cannot receive next comm op bucket");
    match monitor_op_finish_channel_receiver.recv_timeout(Duration::from_secs(300)) {
        Ok(_) => {}
        // The op is still running after the 5 minute budget.
        Err(RecvTimeoutError::Timeout) => {
            panic!("{:?} comm op has not finished for 5 min, panic", op_bucket);
        }
        // Every finish sender was dropped before signalling completion;
        // report that accurately instead of claiming a 5 minute stall.
        Err(RecvTimeoutError::Disconnected) => {
            panic!(
                "{:?} comm op finish channel disconnected before the op was marked finished",
                op_bucket
            );
        }
    }
}),
}
Expand Down
8 changes: 8 additions & 0 deletions bagua-core-py/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,14 @@ fn bagua_core(_py: Python, m: &PyModule) -> PyResult<()> {
.init();
color_eyre::install().unwrap();

// Make a panic on any thread terminate the whole process instead of only
// unwinding the panicking thread.
// `take_hook` hands back the hook registered at this point so it can still be
// invoked from the replacement hook.
// NOTE(review): `color_eyre::install()` runs just above, so this is presumably
// its report hook — confirm.
let orig_hook = std::panic::take_hook();
std::panic::set_hook(Box::new(move |panic_info| {
// Run the previous hook first so the panic report is still printed, then
// exit with status 1. `std::process::exit` does not run destructors.
orig_hook(panic_info);
std::process::exit(1);
}));

m.add_class::<BaguaCommBackendPy>()?;
m.add_class::<BaguaTensorPy>()?;
m.add_class::<BaguaBucketPy>()?;
Expand Down

0 comments on commit d83005d

Please sign in to comment.