From ab2044635922dab9fd3f566f539f7d2e65d4bb75 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Tue, 8 Mar 2022 17:40:22 +0000
Subject: [PATCH 1/5] Retry replication HTTP requests that fail to connect

This allows the target process to be down for around a minute, which
provides time for restarts during Synapse upgrades/config updates.
---
 synapse/replication/http/_base.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index 2e697c74a6bb..c6c9b0d56010 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -22,6 +22,7 @@
 from prometheus_client import Counter, Gauge
 
 from twisted.web.server import Request
+from twisted.internet.error import ConnectError, DNSLookupError
 
 from synapse.api.errors import HttpResponseException, SynapseError
 from synapse.http import RequestTimedOutError
@@ -94,6 +95,8 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
     METHOD = "POST"
     CACHE = True
     RETRY_ON_TIMEOUT = True
+    RETRY_ON_CONNECT_ERROR = True
+    RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5  # =63s (2^6-1)
 
     def __init__(self, hs: "HomeServer"):
         if self.CACHE:
@@ -237,6 +240,9 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                 )
 
                 try:
+                    # Keep track of attempts made so we can bail if we don't manage to
+                    # connect to the target after N tries.
+                    attempts = 0
                     # We keep retrying the same request for timeouts. This is so that we
                     # have a good idea that the request has either succeeded or failed
                     # on the master, and so whether we should clean up or not.
@@ -255,11 +261,21 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                             if not cls.RETRY_ON_TIMEOUT:
                                 raise
 
-                        logger.warning("%s request timed out; retrying", cls.NAME)
+                            logger.warning("%s request timed out; retrying", cls.NAME)
 
-                        # If we timed out we probably don't need to worry about backing
-                        # off too much, but lets just wait a little anyway.
-                        await clock.sleep(1)
+                            # If we timed out we probably don't need to worry about backing
+                            # off too much, but lets just wait a little anyway.
+                            await clock.sleep(1)
+                        except (ConnectError, DNSLookupError):
+                            if not cls.RETRY_ON_CONNECT_ERROR:
+                                raise
+                            if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
+                                raise
+
+                            logger.warning("%s request connection failed; retrying", cls.NAME)
+
+                            await clock.sleep(2 ** attempts)
+                            attempts += 1
                 except HttpResponseException as e:
                     # We convert to SynapseError as we know that it was a SynapseError
                     # on the main process that we should send to the client. (And

From 4dc23f5c2c44ae8f52ad0a04ed0acc5ac80540b3 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Tue, 8 Mar 2022 17:41:02 +0000
Subject: [PATCH 2/5] Move header dict creation out of the request loop

Tiny optimisation / code readability improvement.
---
 synapse/replication/http/_base.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index c6c9b0d56010..4aca8e7a1bbe 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -21,8 +21,8 @@
 
 from prometheus_client import Counter, Gauge
 
-from twisted.web.server import Request
 from twisted.internet.error import ConnectError, DNSLookupError
+from twisted.web.server import Request
 
 from synapse.api.errors import HttpResponseException, SynapseError
 from synapse.http import RequestTimedOutError
@@ -239,6 +239,12 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                     "/".join(url_args),
                 )
 
+                headers: Dict[bytes, List[bytes]] = {}
+                # Add an authorization header, if configured.
+                if replication_secret:
+                    headers[b"Authorization"] = [b"Bearer " + replication_secret]
+                opentracing.inject_header_dict(headers, check_destination=False)
+
                 try:
                     # Keep track of attempts made so we can bail if we don't manage to
                     # connect to the target after N tries.
                     attempts = 0
@@ -247,13 +253,6 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                     # have a good idea that the request has either succeeded or failed
                     # on the master, and so whether we should clean up or not.
                     while True:
-                        headers: Dict[bytes, List[bytes]] = {}
-                        # Add an authorization header, if configured.
-                        if replication_secret:
-                            headers[b"Authorization"] = [
-                                b"Bearer " + replication_secret
-                            ]
-                        opentracing.inject_header_dict(headers, check_destination=False)
                         try:
                             result = await request_func(uri, data, headers=headers)
                             break
@@ -272,7 +271,9 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                             if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                                 raise
 
-                            logger.warning("%s request connection failed; retrying", cls.NAME)
+                            logger.warning(
+                                "%s request connection failed; retrying", cls.NAME
+                            )
 
                             await clock.sleep(2 ** attempts)
                             attempts += 1
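A note on the numbers above: with RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5 the loop sleeps 1 + 2 + 4 + 8 + 16 + 32 = 63 seconds before giving up, which is where the "# =63s (2^6-1)" comment and the "down for around a minute" claim in the first commit message come from. The following is a minimal standalone sketch of that schedule, not the Synapse code itself: it uses asyncio and the built-in ConnectionError in place of Twisted's clock and ConnectError/DNSLookupError, and the function name is made up.

    import asyncio


    RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5  # same cap as the patch


    async def send_with_retries(make_request):
        # Retry an awaitable factory on connection errors with exponential backoff.
        attempts = 0
        while True:
            try:
                return await make_request()
            except ConnectionError:
                # Mirror the patch: give up once we have exceeded the configured cap.
                if attempts > RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                    raise
                # Sleeps 1, 2, 4, 8, 16, 32 seconds across the six retries (63s total).
                await asyncio.sleep(2 ** attempts)
                attempts += 1

Note that in the patch only connection failures back off exponentially; the existing timeout path keeps its flat one-second sleep.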
From 5c4e815bfca59c0616075630aa4d04626a9ce5c8 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Tue, 8 Mar 2022 17:51:29 +0000
Subject: [PATCH 3/5] Add changelog file

---
 changelog.d/12182.misc | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/12182.misc

diff --git a/changelog.d/12182.misc b/changelog.d/12182.misc
new file mode 100644
index 000000000000..7e9ad2c75244
--- /dev/null
+++ b/changelog.d/12182.misc
@@ -0,0 +1 @@
+Retry HTTP replication failures; this should prevent 502s when restarting stateful workers (main, event persisters, stream writers). Contributed by Nick @ Beeper.

From bde45663d04bc04902654abe457faaabb97ac136 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Wed, 9 Mar 2022 14:25:19 +0000
Subject: [PATCH 4/5] Add retry on connect error variables to docstring

---
 synapse/replication/http/_base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index 4aca8e7a1bbe..736d0121164b 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -88,6 +88,10 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
             `_handle_request` must return a Deferred.
         RETRY_ON_TIMEOUT(bool): Whether or not to retry the request when a
             504 is received.
+        RETRY_ON_CONNECT_ERROR (bool): Whether or not to retry the request when
+            a connection error is received.
+        RETRY_ON_CONNECT_ERROR_ATTEMPTS (int): Number of attempts to retry when
+            receiving connection errors; each retry backs off exponentially longer.
     """
 
     NAME: str = abc.abstractproperty()  # type: ignore
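To make the documented knobs concrete: an individual endpoint can tune or disable the new behaviour per class, just as it already can with RETRY_ON_TIMEOUT. The sketch below is illustrative only; the endpoint name is made up and the abstract `_serialize_payload` / `_handle_request` members that a real endpoint must provide are omitted.

    from synapse.replication.http._base import ReplicationEndpoint


    class ExampleReplicationEndpoint(ReplicationEndpoint):
        # Hypothetical endpoint, shown only to illustrate the class-level knobs.
        NAME = "example"
        METHOD = "POST"
        CACHE = True

        # Fail fast if the target worker cannot be reached at all...
        RETRY_ON_CONNECT_ERROR = False

        # ...or keep the retries but cap the backoff at 1 + 2 + 4 = 7 seconds:
        # RETRY_ON_CONNECT_ERROR_ATTEMPTS = 2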
From 3cc2a26c7b21914cd7d58bf6e32999fe5575a6d3 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Wed, 9 Mar 2022 14:25:31 +0000
Subject: [PATCH 5/5] Include exception and delay time in retry log

---
 synapse/replication/http/_base.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index 736d0121164b..f1abb986534b 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -269,17 +269,21 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                             # If we timed out we probably don't need to worry about backing
                             # off too much, but lets just wait a little anyway.
                             await clock.sleep(1)
-                        except (ConnectError, DNSLookupError):
+                        except (ConnectError, DNSLookupError) as e:
                             if not cls.RETRY_ON_CONNECT_ERROR:
                                 raise
                             if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                                 raise
 
+                            delay = 2 ** attempts
                             logger.warning(
-                                "%s request connection failed; retrying", cls.NAME
+                                "%s request connection failed; retrying in %ds: %r",
+                                cls.NAME,
+                                delay,
+                                e,
                             )
 
-                            await clock.sleep(2 ** attempts)
+                            await clock.sleep(delay)
                             attempts += 1
                 except HttpResponseException as e:
                     # We convert to SynapseError as we know that it was a SynapseError
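For reference, a rough sketch of what the reworded warning ends up looking like, using the standard logging module and made-up values; the endpoint name, the exception, and the log prefix are all illustrative and depend on the homeserver's logging configuration.

    import logging

    logging.basicConfig(level=logging.WARNING, format="%(levelname)s - %(message)s")
    logger = logging.getLogger("synapse.replication.http")

    name = "example"        # stands in for cls.NAME
    attempts = 2
    delay = 2 ** attempts   # third retry, so a 4 second wait
    e = ConnectionRefusedError(111, "Connection refused")  # stands in for ConnectError/DNSLookupError

    logger.warning("%s request connection failed; retrying in %ds: %r", name, delay, e)
    # Logs something like:
    # WARNING - example request connection failed; retrying in 4s: ConnectionRefusedError(111, 'Connection refused')

Compared to the previous message, an operator can now see how long the caller will wait before the next attempt and which error (connection refused vs. DNS failure) caused it.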