Skip to content

Commit

Permalink
Reset watch retry count on successful connection to API Server (#267)
Browse files Browse the repository at this point in the history
* Reset watch retry count on successful connection to API Server

* Reset pod watch retry count on successful connection to API Server

* Fix tests

* Re-add tests for Fluent::UnrecoverableError
  • Loading branch information
andrzej-stencel authored Jan 22, 2021
1 parent 84f66a8 commit d5fe9b7
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 31 deletions.
4 changes: 3 additions & 1 deletion lib/fluent/plugin/kubernetes_metadata_watch_namespaces.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def get_namespaces_and_start_watcher
# continue watching from most recent resourceVersion
options[:resource_version] = namespaces[:metadata][:resourceVersion]

@client.watch_namespaces(options)
watcher = @client.watch_namespaces(options)
reset_namespace_watch_retry_stats
watcher
end

# Reset namespace watch retry count and backoff interval as there is a
Expand Down
5 changes: 4 additions & 1 deletion lib/fluent/plugin/kubernetes_metadata_watch_pods.rb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def get_pods_and_start_watcher
# continue watching from most recent resourceVersion
options[:resource_version] = pods[:metadata][:resourceVersion]
end
@client.watch_pods(options)

watcher = @client.watch_pods(options)
reset_pod_watch_retry_stats
watcher
end

# Reset pod watch retry count and backoff interval as there is a
Expand Down
50 changes: 36 additions & 14 deletions test/plugin/test_watch_namespaces.rb
Original file line number Diff line number Diff line change
Expand Up @@ -148,30 +148,52 @@ class WatchNamespacesTestTest < WatchTest
end
end

test 'namespace watch retries when exceptions are encountered' do
test 'namespace watch raises Fluent::UnrecoverableError when cannot re-establish connection to k8s API server' do
# Stub start_namespace_watch to simulate initial successful connection to API server
stub(self).start_namespace_watch
# Stub watch_namespaces to simluate not being able to set up watch connection to API server
stub(@client).watch_namespaces { raise }
@client.stub :get_namespaces, @initial do
assert_raise Fluent::UnrecoverableError do
set_up_namespace_thread
end
end
assert_equal(3, @stats[:namespace_watch_failures])
assert_equal(2, Thread.current[:namespace_watch_retry_count])
assert_equal(4, Thread.current[:namespace_watch_retry_backoff_interval])
assert_nil(@stats[:namespace_watch_error_type_notices])
end

test 'namespace watch resets watch retry count when exceptions are encountered and connection to k8s API server is re-established' do
@client.stub :get_namespaces, @initial do
@client.stub :watch_namespaces, [[@created, @exception_raised]] do
assert_raise Fluent::UnrecoverableError do
set_up_namespace_thread
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_namespace_thread
end
end
assert_equal(3, @stats[:namespace_watch_failures])
assert_equal(2, Thread.current[:namespace_watch_retry_count])
assert_equal(4, Thread.current[:namespace_watch_retry_backoff_interval])
assert_nil(@stats[:namespace_watch_error_type_notices])
assert_operator(@stats[:namespace_watch_failures], :>=, 3)
assert_operator(Thread.current[:namespace_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:namespace_watch_retry_backoff_interval], :<=, 1)
end
end
end

test 'namespace watch retries when error is received' do
test 'namespace watch resets watch retry count when error is received and connection to k8s API server is re-established' do
@client.stub :get_namespaces, @initial do
@client.stub :watch_namespaces, [@error] do
assert_raise Fluent::UnrecoverableError do
set_up_namespace_thread
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_namespace_thread
end
end
assert_equal(3, @stats[:namespace_watch_failures])
assert_equal(2, Thread.current[:namespace_watch_retry_count])
assert_equal(4, Thread.current[:namespace_watch_retry_backoff_interval])
assert_equal(3, @stats[:namespace_watch_error_type_notices])
assert_operator(@stats[:namespace_watch_failures], :>=, 3)
assert_operator(Thread.current[:namespace_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:namespace_watch_retry_backoff_interval], :<=, 1)
end
end
end
Expand Down
53 changes: 38 additions & 15 deletions test/plugin/test_watch_pods.rb
Original file line number Diff line number Diff line change
Expand Up @@ -234,30 +234,53 @@ class DefaultPodWatchStrategyTest < WatchTest
end
end

test 'pod watch retries when exceptions are encountered' do
test 'pod watch raises Fluent::UnrecoverableError when cannot re-establish connection to k8s API server' do
# Stub start_pod_watch to simulate initial successful connection to API server
stub(self).start_pod_watch
# Stub watch_pods to simluate not being able to set up watch connection to API server
stub(@client).watch_pods { raise }
@client.stub :get_pods, @initial do
assert_raise Fluent::UnrecoverableError do
set_up_pod_thread
end
end
assert_equal(3, @stats[:pod_watch_failures])
assert_equal(2, Thread.current[:pod_watch_retry_count])
assert_equal(4, Thread.current[:pod_watch_retry_backoff_interval])
assert_nil(@stats[:pod_watch_error_type_notices])
end

test 'pod watch resets watch retry count when exceptions are encountered and connection to k8s API server is re-established' do
@client.stub :get_pods, @initial do
@client.stub :watch_pods, [[@created, @exception_raised]] do
assert_raise Fluent::UnrecoverableError do
set_up_pod_thread
end
assert_equal(3, @stats[:pod_watch_failures])
assert_equal(2, Thread.current[:pod_watch_retry_count])
assert_equal(4, Thread.current[:pod_watch_retry_backoff_interval])
assert_nil(@stats[:pod_watch_error_type_notices])
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_pod_thread
end
end
assert_operator(@stats[:pod_watch_failures], :>=, 3)
assert_operator(Thread.current[:pod_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:pod_watch_retry_backoff_interval], :<=, 1)
end
end
end

test 'pod watch retries when error is received' do
test 'pod watch resets watch retry count when error is received and connection to k8s API server is re-established' do
@client.stub :get_pods, @initial do
@client.stub :watch_pods, [@error] do
assert_raise Fluent::UnrecoverableError do
set_up_pod_thread
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_pod_thread
end
end
assert_equal(3, @stats[:pod_watch_failures])
assert_equal(2, Thread.current[:pod_watch_retry_count])
assert_equal(4, Thread.current[:pod_watch_retry_backoff_interval])
assert_equal(3, @stats[:pod_watch_error_type_notices])
assert_operator(@stats[:pod_watch_failures], :>=, 3)
assert_operator(Thread.current[:pod_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:pod_watch_retry_backoff_interval], :<=, 1)
assert_operator(@stats[:pod_watch_error_type_notices], :>=, 3)
end
end
end
Expand Down

0 comments on commit d5fe9b7

Please sign in to comment.