From ab2044635922dab9fd3f566f539f7d2e65d4bb75 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Tue, 8 Mar 2022 17:40:22 +0000
Subject: [PATCH 1/5] Retry replication HTTP requests that fail to connect

This allows the target process to be down for around a minute, which
provides time for restarts during Synapse upgrades/config updates.
---
 synapse/replication/http/_base.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index 2e697c74a6bb..c6c9b0d56010 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -22,6 +22,7 @@
 from prometheus_client import Counter, Gauge
 
 from twisted.web.server import Request
+from twisted.internet.error import ConnectError, DNSLookupError
 
 from synapse.api.errors import HttpResponseException, SynapseError
 from synapse.http import RequestTimedOutError
@@ -94,6 +95,8 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
     METHOD = "POST"
     CACHE = True
     RETRY_ON_TIMEOUT = True
+    RETRY_ON_CONNECT_ERROR = True
+    RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5  # =63s (2^6-1)
 
     def __init__(self, hs: "HomeServer"):
         if self.CACHE:
@@ -237,6 +240,9 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                 )
 
                 try:
+                    # Keep track of attempts made so we can bail if we don't manage to
+                    # connect to the target after N tries.
+                    attempts = 0
                     # We keep retrying the same request for timeouts. This is so that we
                     # have a good idea that the request has either succeeded or failed
                     # on the master, and so whether we should clean up or not.
@@ -255,11 +261,21 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                             if not cls.RETRY_ON_TIMEOUT:
                                 raise
 
-                        logger.warning("%s request timed out; retrying", cls.NAME)
+                            logger.warning("%s request timed out; retrying", cls.NAME)
 
-                        # If we timed out we probably don't need to worry about backing
-                        # off too much, but lets just wait a little anyway.
-                        await clock.sleep(1)
+                            # If we timed out we probably don't need to worry about backing
+                            # off too much, but lets just wait a little anyway.
+                            await clock.sleep(1)
+                        except (ConnectError, DNSLookupError):
+                            if not cls.RETRY_ON_CONNECT_ERROR:
+                                raise
+                            if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
+                                raise
+
+                            logger.warning("%s request connection failed; retrying", cls.NAME)
+
+                            await clock.sleep(2 ** attempts)
+                            attempts += 1
                 except HttpResponseException as e:
                     # We convert to SynapseError as we know that it was a SynapseError
                     # on the main process that we should send to the client. (And

From 4dc23f5c2c44ae8f52ad0a04ed0acc5ac80540b3 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Tue, 8 Mar 2022 17:41:02 +0000
Subject: [PATCH 2/5] Move header dict creation out of the request loop

Tiny optimisation / code readability improvement.
---
 synapse/replication/http/_base.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index c6c9b0d56010..4aca8e7a1bbe 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -21,8 +21,8 @@
 
 from prometheus_client import Counter, Gauge
 
-from twisted.web.server import Request
 from twisted.internet.error import ConnectError, DNSLookupError
+from twisted.web.server import Request
 
 from synapse.api.errors import HttpResponseException, SynapseError
 from synapse.http import RequestTimedOutError
@@ -239,6 +239,12 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                     "/".join(url_args),
                 )
 
+                headers: Dict[bytes, List[bytes]] = {}
+                # Add an authorization header, if configured.
+                if replication_secret:
+                    headers[b"Authorization"] = [b"Bearer " + replication_secret]
+                opentracing.inject_header_dict(headers, check_destination=False)
+
                 try:
                     # Keep track of attempts made so we can bail if we don't manage to
                     # connect to the target after N tries.
                     attempts = 0
@@ -247,13 +253,6 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                     # have a good idea that the request has either succeeded or failed
                     # on the master, and so whether we should clean up or not.
                     while True:
-                        headers: Dict[bytes, List[bytes]] = {}
-                        # Add an authorization header, if configured.
-                        if replication_secret:
-                            headers[b"Authorization"] = [
-                                b"Bearer " + replication_secret
-                            ]
-                        opentracing.inject_header_dict(headers, check_destination=False)
                         try:
                             result = await request_func(uri, data, headers=headers)
                             break
@@ -272,7 +271,9 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                             if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                                 raise
 
-                            logger.warning("%s request connection failed; retrying", cls.NAME)
+                            logger.warning(
+                                "%s request connection failed; retrying", cls.NAME
+                            )
 
                             await clock.sleep(2 ** attempts)
                             attempts += 1
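A note on the numbers above: with RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5 the loop sleeps 1 + 2 + 4 + 8 + 16 + 32 = 63 seconds before giving up, which is where the "# =63s (2^6-1)" comment and the "down for around a minute" claim in the first commit message come from. The following is a minimal standalone sketch of that schedule, not the Synapse code itself: it uses asyncio and the built-in ConnectionError in place of Twisted's clock and ConnectError/DNSLookupError, and the function name is made up.

    import asyncio


    RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5  # same cap as the patch


    async def send_with_retries(make_request):
        # Retry an awaitable factory on connection errors with exponential backoff.
        attempts = 0
        while True:
            try:
                return await make_request()
            except ConnectionError:
                # Mirror the patch: give up once we have exceeded the configured cap.
                if attempts > RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                    raise
                # Sleeps 1, 2, 4, 8, 16, 32 seconds across the six retries (63s total).
                await asyncio.sleep(2 ** attempts)
                attempts += 1

Note that in the patch only connection failures back off exponentially; the existing timeout path keeps its flat one-second sleep.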
From 5c4e815bfca59c0616075630aa4d04626a9ce5c8 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Tue, 8 Mar 2022 17:51:29 +0000
Subject: [PATCH 3/5] Add changelog file

---
 changelog.d/12182.misc | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/12182.misc

diff --git a/changelog.d/12182.misc b/changelog.d/12182.misc
new file mode 100644
index 000000000000..7e9ad2c75244
--- /dev/null
+++ b/changelog.d/12182.misc
@@ -0,0 +1 @@
+Retry HTTP replication failures; this should prevent 502s when restarting stateful workers (main, event persisters, stream writers). Contributed by Nick @ Beeper.

From bde45663d04bc04902654abe457faaabb97ac136 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Wed, 9 Mar 2022 14:25:19 +0000
Subject: [PATCH 4/5] Add retry on connect error variables to docstring

---
 synapse/replication/http/_base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index 4aca8e7a1bbe..736d0121164b 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -88,6 +88,10 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
             `_handle_request` must return a Deferred.
         RETRY_ON_TIMEOUT(bool): Whether or not to retry the request when a
             504 is received.
+        RETRY_ON_CONNECT_ERROR (bool): Whether or not to retry the request when
+            a connection error is received.
+        RETRY_ON_CONNECT_ERROR_ATTEMPTS (int): Number of attempts to retry when
+            receiving connection errors; each retry backs off exponentially longer.
     """
 
     NAME: str = abc.abstractproperty()  # type: ignore
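To make the documented knobs concrete: an individual endpoint can tune or disable the new behaviour per class, just as it already can with RETRY_ON_TIMEOUT. The sketch below is illustrative only; the endpoint name is made up and the abstract `_serialize_payload` / `_handle_request` members that a real endpoint must provide are omitted.

    from synapse.replication.http._base import ReplicationEndpoint


    class ExampleReplicationEndpoint(ReplicationEndpoint):
        # Hypothetical endpoint, shown only to illustrate the class-level knobs.
        NAME = "example"
        METHOD = "POST"
        CACHE = True

        # Fail fast if the target worker cannot be reached at all...
        RETRY_ON_CONNECT_ERROR = False

        # ...or keep the retries but cap the backoff at 1 + 2 + 4 = 7 seconds:
        # RETRY_ON_CONNECT_ERROR_ATTEMPTS = 2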
From 3cc2a26c7b21914cd7d58bf6e32999fe5575a6d3 Mon Sep 17 00:00:00 2001
From: Nick Mills-Barrett
Date: Wed, 9 Mar 2022 14:25:31 +0000
Subject: [PATCH 5/5] Include exception and delay time in retry log

---
 synapse/replication/http/_base.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py
index 736d0121164b..f1abb986534b 100644
--- a/synapse/replication/http/_base.py
+++ b/synapse/replication/http/_base.py
@@ -269,17 +269,21 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                             # If we timed out we probably don't need to worry about backing
                             # off too much, but lets just wait a little anyway.
                             await clock.sleep(1)
-                        except (ConnectError, DNSLookupError):
+                        except (ConnectError, DNSLookupError) as e:
                             if not cls.RETRY_ON_CONNECT_ERROR:
                                 raise
                             if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                                 raise
 
+                            delay = 2 ** attempts
                             logger.warning(
-                                "%s request connection failed; retrying", cls.NAME
+                                "%s request connection failed; retrying in %ds: %r",
+                                cls.NAME,
+                                delay,
+                                e,
                             )
 
-                            await clock.sleep(2 ** attempts)
+                            await clock.sleep(delay)
                             attempts += 1
                 except HttpResponseException as e:
                     # We convert to SynapseError as we know that it was a SynapseError
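For reference, a rough sketch of what the reworded warning ends up looking like, using the standard logging module and made-up values; the endpoint name, the exception, and the log prefix are all illustrative and depend on the homeserver's logging configuration.

    import logging

    logging.basicConfig(level=logging.WARNING, format="%(levelname)s - %(message)s")
    logger = logging.getLogger("synapse.replication.http")

    name = "example"        # stands in for cls.NAME
    attempts = 2
    delay = 2 ** attempts   # third retry, so a 4 second wait
    e = ConnectionRefusedError(111, "Connection refused")  # stands in for ConnectError/DNSLookupError

    logger.warning("%s request connection failed; retrying in %ds: %r", name, delay, e)
    # Logs something like:
    # WARNING - example request connection failed; retrying in 4s: ConnectionRefusedError(111, 'Connection refused')

Compared to the previous message, an operator can now see how long the caller will wait before the next attempt and which error (connection refused vs. DNS failure) caused it.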