Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

retry datadog errors #1774

Merged
merged 3 commits into from
Nov 20, 2020
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 31 additions & 27 deletions priv/perf/apps/load_test/lib/service/datadog/api.ex
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,15 @@ defmodule LoadTest.Service.Datadog.API do
assert_metrics(environment, start_unix, end_unix)
end

def do_assert_metrics(environment, start_unix, end_unix, poll_count \\ 200) do
defp do_assert_metrics(environment, start_unix, end_unix, poll_count \\ 60)

defp do_assert_metrics(_environment, _start_unix, _end_unix, 0), do: :ok

defp do_assert_metrics(environment, start_unix, end_unix, poll_count) do
case fetch_events(start_unix, end_unix, environment) do
{:ok, []} ->
case poll_count do
0 ->
:ok

_ ->
Process.sleep(1_000)
do_assert_metrics(environment, start_unix, end_unix, poll_count - 1)
end
Process.sleep(1_000)
do_assert_metrics(environment, start_unix, end_unix, poll_count - 1)

{:ok, events} ->
# failed monitors with the same tags do not emit events, so we're resolving
Expand All @@ -60,7 +58,15 @@ defmodule LoadTest.Service.Datadog.API do
end
end

def fetch_events(start_time, end_time, environment) do
defp fetch_events(start_time, end_time, environment, retries \\ 5)

# Terminal clause of the retry loop: reached once the `retries` counter has
# been decremented to 0 by the failing branches of the main clause.
# Logs the query window and environment that could not be fetched and gives up
# with a fixed error tuple instead of retrying again.
defp fetch_events(start_time, end_time, environment, 0) do
Logger.error("failed to fetch events #{inspect({start_time, end_time, environment})}")

{:error, :failed_to_fetch_event}
end

defp fetch_events(start_time, end_time, environment, retries) do
params = %{
start: start_time,
end: end_time,
Expand All @@ -80,10 +86,12 @@ defmodule LoadTest.Service.Datadog.API do
{:ok, events}

{:ok, %{body: body}} ->
{:error, body}
Logger.error("failed to fetch events #{inspect(body)}")
fetch_events(start_time, end_time, environment, retries - 1)

{:error, error} ->
{:error, error}
Logger.error("failed to fetch events #{inspect(error)}")
fetch_events(start_time, end_time, environment, retries - 1)
end
end

Expand Down Expand Up @@ -119,6 +127,12 @@ defmodule LoadTest.Service.Datadog.API do

defp do_resolve_monitors([], _), do: :ok

# Terminal clause of the retry loop: the retry budget is exhausted, so log the
# monitors that could not be resolved and return a fixed error tuple.
defp do_resolve_monitors(params, 0) do
  # `params` is a list whose elements may not implement String.Chars, so it
  # must go through inspect/1; interpolating it directly can raise while logging.
  Logger.error("failed to resolve monitors #{inspect(params)}")

  {:error, :failed_to_resolve_monitors}
end

defp do_resolve_monitors(params, retries) do
payload = Jason.encode!(%{"resolve" => params})

Expand All @@ -131,26 +145,16 @@ defmodule LoadTest.Service.Datadog.API do
{:ok, %{body: body}} ->
Logger.error("failed to resolve monitors #{inspect(body)}")

case retries do
0 ->
{:error, body}
Process.sleep(1_000)

_ ->
Process.sleep(1_000)
do_resolve_monitors(params, retries - 1)
end
do_resolve_monitors(params, retries - 1)

{:error, error} = other ->
{:error, error} ->
Logger.error("failed to resolve monitors #{inspect(error)}")
Copy link
Contributor

@boolafish boolafish Nov 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: I know this is not part of this PR's changes, but you might consider using the warn level and adding a few words to indicate that it is going to retry, since at this point it is still trying to recover from the error.


case retries do
0 ->
other
Process.sleep(1_000)

_ ->
Process.sleep(1_000)
do_resolve_monitors(params, retries - 1)
end
do_resolve_monitors(params, retries - 1)
end
end

Expand Down