Skip to content

Commit

Permalink
Availability: Adds retry on 408s to gateway calls for metadata operat…
Browse files Browse the repository at this point in the history
…ions (#3089)

This applies the same retry logic for a 408 from gateway that is already applied if the a timeout occurs trying to connect to the gateway. This will improve availability to customers because the SDK will not automatically retry on a timeout from gateway for metadata operation. This will only be used for cache operations where multiple requests are waiting on the result and a single timeout would result in multiple failures.
  • Loading branch information
j82w authored Mar 16, 2022
1 parent e67cf3f commit 84c1145
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 5 deletions.
26 changes: 21 additions & 5 deletions Microsoft.Azure.Cosmos/src/HttpClient/CosmosHttpClientCore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -257,17 +257,24 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
datum.RecordHttpResponse(requestMessage, responseMessage, resourceType, requestStartTime);
}

return responseMessage;
if (!timeoutPolicy.ShouldRetryBasedOnResponse(requestMessage.Method, responseMessage))
{
return responseMessage;
}

bool isOutOfRetries = CosmosHttpClientCore.IsOutOfRetries(timeoutPolicy, startDateTimeUtc, timeoutEnumerator);
if (isOutOfRetries)
{
return responseMessage;
}
}
catch (Exception e)
{
if (clientSideRequestStatistics is ClientSideRequestStatisticsTraceDatum datum)
{
datum.RecordHttpException(requestMessage, e, resourceType, requestStartTime);
}

bool isOutOfRetries = (DateTime.UtcNow - startDateTimeUtc) > timeoutPolicy.MaximumRetryTimeLimit || // Maximum of time for all retries
!timeoutEnumerator.MoveNext(); // No more retries are configured
bool isOutOfRetries = CosmosHttpClientCore.IsOutOfRetries(timeoutPolicy, startDateTimeUtc, timeoutEnumerator);

switch (e)
{
Expand Down Expand Up @@ -308,7 +315,7 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
throw;
}
}

}

if (delayForNextRequest != TimeSpan.Zero)
Expand All @@ -318,6 +325,15 @@ private async Task<HttpResponseMessage> SendHttpHelperAsync(
}
}

private static bool IsOutOfRetries(
HttpTimeoutPolicy timeoutPolicy,
DateTime startDateTimeUtc,
IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> timeoutEnumerator)
{
return (DateTime.UtcNow - startDateTimeUtc) > timeoutPolicy.MaximumRetryTimeLimit || // Maximum of time for all retries
!timeoutEnumerator.MoveNext(); // No more retries are configured
}

private async Task<HttpResponseMessage> ExecuteHttpHelperAsync(
HttpRequestMessage requestMessage,
ResourceType resourceType,
Expand Down
2 changes: 2 additions & 0 deletions Microsoft.Azure.Cosmos/src/HttpClient/HttpTimeoutPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ internal abstract class HttpTimeoutPolicy
public abstract IEnumerator<(TimeSpan requestTimeout, TimeSpan delayForNextRequest)> GetTimeoutEnumerator();
public abstract bool IsSafeToRetry(HttpMethod httpMethod);

public abstract bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage);

public static HttpTimeoutPolicy GetTimeoutPolicy(
DocumentServiceRequest documentServiceRequest)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,10 @@ public override bool IsSafeToRetry(HttpMethod httpMethod)
{
return true;
}

public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
{
return false;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,25 @@ public override bool IsSafeToRetry(HttpMethod httpMethod)
{
return true;
}

public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
{
if (responseMessage == null)
{
return false;
}

if (responseMessage.StatusCode != System.Net.HttpStatusCode.RequestTimeout)
{
return false;
}

if (!this.IsSafeToRetry(requestHttpMethod))
{
return false;
}

return true;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,10 @@ public override bool IsSafeToRetry(HttpMethod httpMethod)
{
return httpMethod == HttpMethod.Get;
}

public override bool ShouldRetryBasedOnResponse(HttpMethod requestHttpMethod, HttpResponseMessage responseMessage)
{
return false;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,55 @@ public async Task VerifyPkRangeCacheRefreshOnThrottlesAsync()
Assert.AreEqual(0, ifNoneMatchValues.Count(x => string.IsNullOrEmpty(x)), "The cache is already init. It should never re-initialize the cache.");
}

[TestMethod]
public async Task VerifyPkRangeCacheRefreshOnTimeoutsAsync()
{
int pkRangeCalls = 0;
HttpClientHandlerHelper httpHandlerHelper = new();
List<string> ifNoneMatchValues = new();
httpHandlerHelper.RequestCallBack = (request, cancellationToken) =>
{
if (!request.RequestUri.ToString().EndsWith("pkranges"))
{
return null;
}
ifNoneMatchValues.Add(request.Headers.IfNoneMatch.ToString());
pkRangeCalls++;
// Cause timeout on the init
if (pkRangeCalls == 1)
{
return Task.FromResult(new HttpResponseMessage(HttpStatusCode.RequestTimeout));
}
return null;
};

CosmosClientOptions clientOptions = new CosmosClientOptions()
{
HttpClientFactory = () => new HttpClient(httpHandlerHelper),
};

CosmosClient resourceClient = TestCommon.CreateCosmosClient(clientOptions);

string dbName = Guid.NewGuid().ToString();
string containerName = nameof(PartitionKeyRangeCacheTests);

Database db = await resourceClient.CreateDatabaseIfNotExistsAsync(dbName);
Container container = await db.CreateContainerIfNotExistsAsync(
containerName,
"/pk",
400);

ToDoActivity toDoActivity = ToDoActivity.CreateRandomToDoActivity();
await container.CreateItemAsync<ToDoActivity>(toDoActivity);
Assert.AreEqual(3, pkRangeCalls);

Assert.AreEqual(2, ifNoneMatchValues.Count(x => string.IsNullOrEmpty(x)), "First call is a 408");
}

[TestMethod]
public async Task TestRidRefreshOnNotFoundAsync()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,107 @@ async Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, Cancellatio
}
}

[TestMethod]
public void VerifyShouldRetryOnResponseTest()
{
foreach (HttpStatusCode statusCode in Enum.GetValues(typeof(HttpStatusCode)))
{
HttpResponseMessage responseMessage = new HttpResponseMessage(statusCode);
Assert.IsFalse(HttpTimeoutPolicyDefault.Instance.ShouldRetryBasedOnResponse(HttpMethod.Get, responseMessage));
Assert.IsFalse(HttpTimeoutPolicyControlPlaneRead.Instance.ShouldRetryBasedOnResponse(HttpMethod.Get, responseMessage));

Assert.IsFalse(HttpTimeoutPolicyDefault.Instance.ShouldRetryBasedOnResponse(HttpMethod.Put, responseMessage));
Assert.IsFalse(HttpTimeoutPolicyControlPlaneRead.Instance.ShouldRetryBasedOnResponse(HttpMethod.Put, responseMessage));

if (statusCode == HttpStatusCode.RequestTimeout)
{
Assert.IsTrue(HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance.ShouldRetryBasedOnResponse(HttpMethod.Get, responseMessage));
Assert.IsTrue(HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance.ShouldRetryBasedOnResponse(HttpMethod.Put, responseMessage));
}
else
{
Assert.IsFalse(HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance.ShouldRetryBasedOnResponse(HttpMethod.Get, responseMessage));
Assert.IsFalse(HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance.ShouldRetryBasedOnResponse(HttpMethod.Put, responseMessage));
}
}
}

[TestMethod]
public async Task RetryTransient408sTestAsync()
{
int count = 0;
async Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken cancellationToken)
{
count++;

if (count <= 2)
{
await Task.Delay(TimeSpan.FromMilliseconds(10));
return new HttpResponseMessage(HttpStatusCode.RequestTimeout);
}

if (count == 3)
{
await Task.Delay(TimeSpan.FromMilliseconds(10));
return new HttpResponseMessage(HttpStatusCode.OK);
}

throw new Exception("Should not return after the success");
}

DocumentClientEventSource eventSource = DocumentClientEventSource.Instance;
HttpMessageHandler messageHandler = new MockMessageHandler(sendFunc);
using CosmosHttpClient cosmoshttpClient = MockCosmosUtil.CreateCosmosHttpClient(() => new HttpClient(messageHandler));

HttpResponseMessage responseMessage = await cosmoshttpClient.SendHttpAsync(() =>
new ValueTask<HttpRequestMessage>(
result: new HttpRequestMessage(HttpMethod.Get, new Uri("http://localhost"))),
resourceType: ResourceType.Collection,
timeoutPolicy: HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance,
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow),
cancellationToken: default);

Assert.AreEqual(HttpStatusCode.OK, responseMessage.StatusCode);
}

[TestMethod]
public async Task DoesNotRetryTransient408sOnDefaultPolicyTestAsync()
{
int count = 0;
async Task<HttpResponseMessage> sendFunc(HttpRequestMessage request, CancellationToken cancellationToken)
{
count++;

if (count <= 2)
{
await Task.Delay(TimeSpan.FromMilliseconds(10));
return new HttpResponseMessage(HttpStatusCode.RequestTimeout);
}

if (count == 3)
{
await Task.Delay(TimeSpan.FromMilliseconds(10));
return new HttpResponseMessage(HttpStatusCode.OK);
}

throw new Exception("Should not return after the success");
}

DocumentClientEventSource eventSource = DocumentClientEventSource.Instance;
HttpMessageHandler messageHandler = new MockMessageHandler(sendFunc);
using CosmosHttpClient cosmoshttpClient = MockCosmosUtil.CreateCosmosHttpClient(() => new HttpClient(messageHandler));

HttpResponseMessage responseMessage = await cosmoshttpClient.SendHttpAsync(() =>
new ValueTask<HttpRequestMessage>(
result: new HttpRequestMessage(HttpMethod.Get, new Uri("http://localhost"))),
resourceType: ResourceType.Collection,
timeoutPolicy: HttpTimeoutPolicyControlPlaneRead.Instance,
clientSideRequestStatistics: new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow),
cancellationToken: default);

Assert.AreEqual(HttpStatusCode.RequestTimeout, responseMessage.StatusCode, "Should be a request timeout");
}

[TestMethod]
public async Task RetryTransientIssuesForQueryPlanTestAsync()
{
Expand Down

0 comments on commit 84c1145

Please sign in to comment.