Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reset watch retry count on successful connection to API Server #267

Merged
merged 4 commits into from
Jan 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lib/fluent/plugin/kubernetes_metadata_watch_namespaces.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def get_namespaces_and_start_watcher
# continue watching from most recent resourceVersion
options[:resource_version] = namespaces[:metadata][:resourceVersion]

@client.watch_namespaces(options)
watcher = @client.watch_namespaces(options)
reset_namespace_watch_retry_stats
watcher
end

# Reset namespace watch retry count and backoff interval as there is a
Expand Down
5 changes: 4 additions & 1 deletion lib/fluent/plugin/kubernetes_metadata_watch_pods.rb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def get_pods_and_start_watcher
# continue watching from most recent resourceVersion
options[:resource_version] = pods[:metadata][:resourceVersion]
end
@client.watch_pods(options)

watcher = @client.watch_pods(options)
reset_pod_watch_retry_stats
watcher
end

# Reset pod watch retry count and backoff interval as there is a
Expand Down
50 changes: 36 additions & 14 deletions test/plugin/test_watch_namespaces.rb
Original file line number Diff line number Diff line change
Expand Up @@ -148,30 +148,52 @@ class WatchNamespacesTestTest < WatchTest
end
end

test 'namespace watch retries when exceptions are encountered' do
test 'namespace watch raises Fluent::UnrecoverableError when cannot re-establish connection to k8s API server' do
# Stub start_namespace_watch to simulate initial successful connection to API server
stub(self).start_namespace_watch
# Stub watch_namespaces to simluate not being able to set up watch connection to API server
stub(@client).watch_namespaces { raise }
@client.stub :get_namespaces, @initial do
assert_raise Fluent::UnrecoverableError do
set_up_namespace_thread
end
end
assert_equal(3, @stats[:namespace_watch_failures])
assert_equal(2, Thread.current[:namespace_watch_retry_count])
assert_equal(4, Thread.current[:namespace_watch_retry_backoff_interval])
assert_nil(@stats[:namespace_watch_error_type_notices])
end

test 'namespace watch resets watch retry count when exceptions are encountered and connection to k8s API server is re-established' do
@client.stub :get_namespaces, @initial do
@client.stub :watch_namespaces, [[@created, @exception_raised]] do
assert_raise Fluent::UnrecoverableError do
set_up_namespace_thread
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_namespace_thread
end
end
assert_equal(3, @stats[:namespace_watch_failures])
assert_equal(2, Thread.current[:namespace_watch_retry_count])
assert_equal(4, Thread.current[:namespace_watch_retry_backoff_interval])
assert_nil(@stats[:namespace_watch_error_type_notices])
assert_operator(@stats[:namespace_watch_failures], :>=, 3)
assert_operator(Thread.current[:namespace_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:namespace_watch_retry_backoff_interval], :<=, 1)
end
end
end

test 'namespace watch retries when error is received' do
test 'namespace watch resets watch retry count when error is received and connection to k8s API server is re-established' do
@client.stub :get_namespaces, @initial do
@client.stub :watch_namespaces, [@error] do
assert_raise Fluent::UnrecoverableError do
set_up_namespace_thread
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_namespace_thread
end
end
assert_equal(3, @stats[:namespace_watch_failures])
assert_equal(2, Thread.current[:namespace_watch_retry_count])
assert_equal(4, Thread.current[:namespace_watch_retry_backoff_interval])
assert_equal(3, @stats[:namespace_watch_error_type_notices])
assert_operator(@stats[:namespace_watch_failures], :>=, 3)
assert_operator(Thread.current[:namespace_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:namespace_watch_retry_backoff_interval], :<=, 1)
end
end
end
Expand Down
53 changes: 38 additions & 15 deletions test/plugin/test_watch_pods.rb
Original file line number Diff line number Diff line change
Expand Up @@ -234,30 +234,53 @@ class DefaultPodWatchStrategyTest < WatchTest
end
end

test 'pod watch retries when exceptions are encountered' do
test 'pod watch raises Fluent::UnrecoverableError when cannot re-establish connection to k8s API server' do
# Stub start_pod_watch to simulate initial successful connection to API server
stub(self).start_pod_watch
# Stub watch_pods to simluate not being able to set up watch connection to API server
stub(@client).watch_pods { raise }
@client.stub :get_pods, @initial do
assert_raise Fluent::UnrecoverableError do
set_up_pod_thread
end
end
assert_equal(3, @stats[:pod_watch_failures])
assert_equal(2, Thread.current[:pod_watch_retry_count])
assert_equal(4, Thread.current[:pod_watch_retry_backoff_interval])
assert_nil(@stats[:pod_watch_error_type_notices])
end

test 'pod watch resets watch retry count when exceptions are encountered and connection to k8s API server is re-established' do
andrzej-stencel marked this conversation as resolved.
Show resolved Hide resolved
@client.stub :get_pods, @initial do
@client.stub :watch_pods, [[@created, @exception_raised]] do
assert_raise Fluent::UnrecoverableError do
set_up_pod_thread
end
assert_equal(3, @stats[:pod_watch_failures])
assert_equal(2, Thread.current[:pod_watch_retry_count])
assert_equal(4, Thread.current[:pod_watch_retry_backoff_interval])
assert_nil(@stats[:pod_watch_error_type_notices])
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_pod_thread
end
end
assert_operator(@stats[:pod_watch_failures], :>=, 3)
assert_operator(Thread.current[:pod_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:pod_watch_retry_backoff_interval], :<=, 1)
end
end
end

test 'pod watch retries when error is received' do
test 'pod watch resets watch retry count when error is received and connection to k8s API server is re-established' do
@client.stub :get_pods, @initial do
@client.stub :watch_pods, [@error] do
assert_raise Fluent::UnrecoverableError do
set_up_pod_thread
# Force the infinite watch loop to exit after 3 seconds. Verifies that
# no unrecoverable error was thrown during this period of time.
assert_raise Timeout::Error.new('execution expired') do
Timeout.timeout(3) do
set_up_pod_thread
end
end
assert_equal(3, @stats[:pod_watch_failures])
assert_equal(2, Thread.current[:pod_watch_retry_count])
assert_equal(4, Thread.current[:pod_watch_retry_backoff_interval])
assert_equal(3, @stats[:pod_watch_error_type_notices])
assert_operator(@stats[:pod_watch_failures], :>=, 3)
assert_operator(Thread.current[:pod_watch_retry_count], :<=, 1)
assert_operator(Thread.current[:pod_watch_retry_backoff_interval], :<=, 1)
assert_operator(@stats[:pod_watch_error_type_notices], :>=, 3)
end
end
end
Expand Down