Skip to content

Commit

Permalink
PR feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
jgallagher committed Jun 24, 2022
1 parent 5e84be2 commit 6740bff
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 17 deletions.
38 changes: 30 additions & 8 deletions sled-agent/src/bootstrap/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
};
use slog::Logger;
use std::collections::HashSet;
use std::net::{Ipv6Addr, SocketAddrV6};
use std::path::{Path, PathBuf};
use std::sync::Arc;
Expand Down Expand Up @@ -58,6 +59,9 @@ pub enum BootstrapError {
#[error(transparent)]
TrustQuorum(#[from] TrustQuorumError),

#[error("Error collecting peer addresses: {0}")]
PeerAddresses(String),

#[error("Failed to initialize bootstrap address: {err}")]
BootstrapAddress { err: crate::illumos::zone::EnsureGzAddressError },
}
Expand Down Expand Up @@ -299,12 +303,29 @@ impl Agent {
let rack_secret = retry_notify(
internal_service_policy(),
|| async {
let other_agents = ddm_admin_client
.peer_addrs()
.await
.map_err(BootstrapError::DdmError)
.map_err(|err| BackoffError::transient(err))?
.collect::<Vec<_>>();
let other_agents = {
// Manually build up a `HashSet` instead of `.collect()`ing
// so we can log if we see any duplicates.
let mut addrs = HashSet::new();
for addr in ddm_admin_client
.peer_addrs()
.await
.map_err(BootstrapError::DdmError)
.map_err(|err| BackoffError::transient(err))?
{
// We should never see duplicates; that would mean
// maghemite thinks two different sleds have the same
// bootstrap address!
if !addrs.insert(addr) {
let msg = format!("Duplicate peer addresses received from ddmd: {addr}");
error!(&self.log, "{}", msg);
return Err(BackoffError::permanent(
BootstrapError::PeerAddresses(msg),
));
}
}
addrs
};
info!(
&self.log,
"Bootstrap: Communicating with peers: {:?}", other_agents
Expand Down Expand Up @@ -348,8 +369,9 @@ impl Agent {
})
.collect();

// TODO: Parallelize this and keep track of whose shares we've already retrieved and
// don't resend. See https://github.com/oxidecomputer/omicron/issues/514
// TODO: Parallelize this and keep track of whose shares we've
// already retrieved and don't resend. See
// https://github.com/oxidecomputer/omicron/issues/514
let mut shares = vec![share.share.clone()];
for agent in &other_agents {
let share = agent.request_share().await
Expand Down
8 changes: 3 additions & 5 deletions sled-agent/src/bootstrap/ddm_admin_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,9 @@ impl DdmAdminClient {
let prefixes = self.client.get_prefixes().await?.into_inner();
info!(self.log, "Received prefixes from ddmd"; "prefixes" => ?prefixes);
Ok(prefixes.into_iter().filter_map(|(_, prefixes)| {
// TODO-correctness What if a single peer is advertising multiple
// bootstrap network prefixes? This will only grab the first. We
// could use `flat_map` instead of `filter_map`, but then our caller
// wouldn't be able to tell "one peer with 3 prefixes" apart from
// "three peers with 1 prefix each".
// If we receive multiple bootstrap prefixes from one peer, trim it
// down to just one. Connections on the bootstrap network are always
// authenticated via sprockets, which only needs one address.
prefixes.into_iter().find_map(|prefix| {
let mut segments = prefix.addr.segments();
if prefix.mask == BOOTSTRAP_MASK
Expand Down
6 changes: 2 additions & 4 deletions sled-agent/src/rack_setup/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,10 +430,8 @@ impl ServiceInner {
let ddm_admin_client = DdmAdminClient::new(self.log.clone())?;
let addrs = retry_notify(
// TODO-correctness `internal_service_policy()` has potentially-long
// exponential backoff; do we want a different infinitely-retrying
// policy that is more aggressive? This is similar to
// https://github.com/oxidecomputer/omicron/issues/996, which
// discusses this issue for disk creation.
// exponential backoff, which is probably not what we want. See
// https://github.com/oxidecomputer/omicron/issues/1270
internal_service_policy(),
|| async {
let peer_addrs =
Expand Down

0 comments on commit 6740bff

Please sign in to comment.