Skip to content

Commit

Permalink
dns: fix exception unsafe behavior in c-ares callbacks. (envoyproxy#4307
Browse files Browse the repository at this point in the history
)

Previously, we were taking an exception due to the late validation of update_merge_window duration
in ClusterManagerImpl::scheduleUpdate, which happened under a c-ares strict DNS host resolution
callback. There are several related issues here:

1. c-ares is exception unsafe, see c-ares/c-ares#219.
2. We should be validating Durations with PGV, see
   bufbuild/protoc-gen-validate#97.
3. We should defer the c-ares resolution callbacks to be outside the c-ares callback context for
   exception safety.

This PR addresses (3) by moving callbacks, even when they are "immediate", to a dispatcher post, so
that we never take an exception under a c-ares callback.

A workaround for (2) is provided, in lieu of bufbuild/protoc-gen-validate#97,
which is blocked on our ability to bump PGV version in Envoy, see
lyft/protoc-gen-star#28.

Fixes oss-fuzz issue https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9868.

Risk level: Medium (DNS clusters will have some timing changes).
Testing: Updated DNS implementation unit tests, server fuzz corpus entry added.

Signed-off-by: Harvey Tuch <htuch@google.com>
  • Loading branch information
htuch authored Aug 31, 2018
1 parent 1212423 commit 1d34172
Showing 8 changed files with 199 additions and 139 deletions.
3 changes: 2 additions & 1 deletion include/envoy/network/dns.h
Original file line number Diff line number Diff line change
@@ -38,7 +38,8 @@ class DnsResolver {
* @param address_list supplies the list of resolved IP addresses. The list will be empty if
* the resolution failed.
*/
typedef std::function<void(std::list<Address::InstanceConstSharedPtr>&& address_list)> ResolveCb;
typedef std::function<void(const std::list<Address::InstanceConstSharedPtr>&& address_list)>
ResolveCb;

/**
* Initiate an async DNS resolution.
5 changes: 3 additions & 2 deletions source/common/network/dns_impl.cc
Original file line number Diff line number Diff line change
@@ -120,7 +120,8 @@ void DnsResolverImpl::PendingResolution::onAresHostCallback(int status, int time

if (completed_) {
if (!cancelled_) {
callback_(std::move(address_list));
dispatcher_.post(
[callback = callback_, al = std::move(address_list)] { callback(std::move(al)); });
}
if (owned_) {
delete this;
@@ -185,7 +186,7 @@ ActiveDnsQuery* DnsResolverImpl::resolve(const std::string& dns_name,
// failed intial call to getHostbyName followed by a synchronous IPv4
// resolution.
std::unique_ptr<PendingResolution> pending_resolution(
new PendingResolution(callback, channel_, dns_name));
new PendingResolution(callback, dispatcher_, channel_, dns_name));
if (dns_lookup_family == DnsLookupFamily::Auto) {
pending_resolution->fallback_if_failed_ = true;
}
7 changes: 5 additions & 2 deletions source/common/network/dns_impl.h
Original file line number Diff line number Diff line change
@@ -39,8 +39,9 @@ class DnsResolverImpl : public DnsResolver, protected Logger::Loggable<Logger::I
friend class DnsResolverImplPeer;
struct PendingResolution : public ActiveDnsQuery {
// Network::ActiveDnsQuery
PendingResolution(ResolveCb callback, ares_channel channel, const std::string& dns_name)
: callback_(callback), channel_(channel), dns_name_(dns_name) {}
PendingResolution(ResolveCb callback, Event::Dispatcher& dispatcher, ares_channel channel,
const std::string& dns_name)
: callback_(callback), dispatcher_(dispatcher), channel_(channel), dns_name_(dns_name) {}

void cancel() override {
// c-ares only supports channel-wide cancellation, so we just allow the
@@ -63,6 +64,8 @@ class DnsResolverImpl : public DnsResolver, protected Logger::Loggable<Logger::I

// Caller supplied callback to invoke on query completion or error.
const ResolveCb callback_;
// Dispatcher to post callback_ to.
Event::Dispatcher& dispatcher_;
// Does the object own itself? Resource reclamation occurs via self-deleting
// on query completion or error.
bool owned_ = false;
4 changes: 2 additions & 2 deletions source/common/upstream/logical_dns_cluster.cc
Original file line number Diff line number Diff line change
@@ -82,8 +82,8 @@ void LogicalDnsCluster::startResolve() {

active_dns_query_ = dns_resolver_->resolve(
dns_address, dns_lookup_family_,
[this,
dns_address](std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
[this, dns_address](
const std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
active_dns_query_ = nullptr;
ENVOY_LOG(debug, "async DNS resolution complete for {}", dns_address);
info_->stats().update_success_.inc();
7 changes: 6 additions & 1 deletion source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
@@ -373,6 +373,11 @@ ClusterInfoImpl::ClusterInfoImpl(const envoy::api::v2::Cluster& config,
idle_timeout_ = std::chrono::milliseconds(
DurationUtil::durationToMilliseconds(config.common_http_protocol_options().idle_timeout()));
}

// TODO(htuch): Remove this temporary workaround when we have
// https://github.com/lyft/protoc-gen-validate/issues/97 resolved. This just provides early
// validation of sanity of fields that we should catch at config ingestion.
DurationUtil::durationToMilliseconds(common_lb_config_.update_merge_window());
}

ProtocolOptionsConfigConstSharedPtr
@@ -1137,7 +1142,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {

active_query_ = parent_.dns_resolver_->resolve(
dns_address_, parent_.dns_lookup_family_,
[this](std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
[this](const std::list<Network::Address::InstanceConstSharedPtr>&& address_list) -> void {
active_query_ = nullptr;
ENVOY_LOG(debug, "async DNS resolution complete for {}", dns_address_);
parent_.info_->stats().update_success_.inc();
265 changes: 135 additions & 130 deletions test/common/network/dns_impl_test.cc

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
node {
locality {
}
}
static_resources {
clusters {
name: "x"
type: STRICT_DNS
connect_timeout {
nanos: 250000000
}
lb_policy: RING_HASH
hosts {
socket_address {
address: "123.1.0.1"
named_port: "x"
}
}
hosts {
socket_address {
address: "127.0.0.2"
named_port: "3"
}
}
common_lb_config {
update_merge_window {
seconds: 281474976710656
}
}
}
}
admin {
access_log_path: "@-"
address {
pipe {
path: " "
}
}
}
8 changes: 7 additions & 1 deletion test/server/server_fuzz_test.cc
Original file line number Diff line number Diff line change
@@ -74,15 +74,21 @@ DEFINE_PROTO_FUZZER(const envoy::config::bootstrap::v2::Bootstrap& input) {
options.log_level_ = Fuzz::Runner::logLevel();
}

std::unique_ptr<InstanceImpl> server;
try {
auto server = std::make_unique<InstanceImpl>(
server = std::make_unique<InstanceImpl>(
options, test_time.timeSource(),
std::make_shared<Network::Address::Ipv4Instance>("127.0.0.1"), hooks, restart, stats_store,
fakelock, component_factory, std::make_unique<Runtime::RandomGeneratorImpl>(),
thread_local_instance);
} catch (const EnvoyException& ex) {
ENVOY_LOG_MISC(debug, "Controlled EnvoyException exit: {}", ex.what());
return;
}
// If we were successful, run any pending events on the main thread's dispatcher loop. These might
// be, for example, pending DNS resolution callbacks. If they generate exceptions, we want to
// explode and fail the test, hence we do this outside of the try-catch above.
server->dispatcher().run(Event::Dispatcher::RunType::NonBlock);
}

} // namespace Server

0 comments on commit 1d34172

Please sign in to comment.