This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix logcontext leak from test_sync #4213

Closed
wants to merge 1 commit
1 change: 1 addition & 0 deletions changelog.d/4213.misc
@@ -0,0 +1 @@
Fix logcontext leaks in EmailPusher and in tests
58 changes: 57 additions & 1 deletion docs/log_contexts.rst
@@ -163,7 +163,7 @@ the logcontext was set, this will make things work out ok: provided
It's all too easy to forget to ``yield``: for instance if we forgot that
``do_some_stuff`` returned a deferred, we might plough on regardless. This
leads to a mess; it will probably work itself out eventually, but not before
- a load of stuff has been logged against the wrong content. (Normally, other
+ a load of stuff has been logged against the wrong context. (Normally, other
things will break, more obviously, if you forget to ``yield``, so this tends
not to be a major problem in practice.)

@@ -440,3 +440,59 @@ To conclude: I think this scheme would have worked equally well, with less
danger of messing it up, and probably made some more esoteric code easier to
write. But again — changing the conventions of the entire Synapse codebase is
not a sensible option for the marginal improvement offered.


A note on garbage-collection of Deferred chains
-----------------------------------------------

It turns out that our logcontext rules do not play nicely with Deferred
chains which get orphaned and garbage-collected.

Imagine we have some code that looks like this:

.. code:: python

    listener_queue = []

    def on_something_interesting():
        for d in listener_queue:
            d.callback("foo")

    @defer.inlineCallbacks
    def await_something_interesting():
        new_deferred = defer.Deferred()
        listener_queue.append(new_deferred)

        with PreserveLoggingContext():
            yield new_deferred

Obviously, the idea here is that we have a bunch of things which are waiting
for an event. (It's just an example of the problem here, but a relatively
common one.)

Now let's imagine two further things happen. First of all, whatever was
waiting for the interesting thing goes away. (Perhaps the request times out,
or something *even more* interesting happens.)

Secondly, let's suppose that we decide that the interesting thing is never
going to happen, and we reset the listener queue:

.. code:: python

    def reset_listener_queue():
        listener_queue.clear()

So, both ends of the deferred chain have now dropped their references, and the
deferred chain is now orphaned, and will be garbage-collected at some point.
Note that ``await_something_interesting`` is a generator function, and when
Python garbage-collects a suspended generator, it gives it a chance to clean
up by making the ``yield`` raise a ``GeneratorExit`` exception. In our case,
that means that the ``__exit__`` handler of ``PreserveLoggingContext`` will
carefully restore the request context, but there is now nothing waiting for
its return, so the request context is never cleared.
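
The underlying Python behaviour can be seen without any Twisted machinery at
all. Here is a minimal, standalone sketch (the names are purely illustrative,
not Synapse code): a context manager stands in for ``PreserveLoggingContext``,
and garbage-collecting the suspended generator makes the ``yield`` raise
``GeneratorExit``, so ``__exit__`` runs even though nothing is waiting for the
generator to finish:

.. code:: python

    import gc

    class FakeContextRestorer(object):
        """Stand-in for PreserveLoggingContext: just reports enter/exit."""

        def __enter__(self):
            print("entered: previous logcontext saved")
            return self

        def __exit__(self, exc_type, exc, tb):
            # With PreserveLoggingContext, this is where the request context
            # would be restored -- with nothing left running to clear it.
            print("exited via %s" % (exc_type.__name__ if exc_type else "return"))
            return False

    def waiter():
        with FakeContextRestorer():
            yield  # suspended here, waiting for a result that never arrives

    g = waiter()
    next(g)       # run up to the yield, inside the with block
    del g         # drop the only reference to the suspended generator
    gc.collect()  # finalisation throws GeneratorExit in at the yield point,
                  # so __exit__ runs with exc_type == GeneratorExit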

To reiterate, this problem only arises when *both* ends of a deferred chain
are dropped. Dropping the reference to a deferred you're supposed to be
calling is probably bad practice, so this doesn't actually happen too much.
Unfortunately, when it does happen, it will lead to leaked logcontexts which
are incredibly hard to track down.
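
As the example above suggests, the problem can be avoided by making sure that
abandoned deferreds are *completed* rather than silently dropped. The
following is only an illustrative sketch of that idea (it is not what Synapse
itself does): cancelling the pending deferreds resumes the waiting generators,
so they run to completion instead of being garbage-collected while suspended.

.. code:: python

    def reset_listener_queue():
        # Cancel the pending deferreds instead of orphaning them. Each waiting
        # generator is resumed (its yield raises CancelledError) and exits, so
        # the PreserveLoggingContext block is unwound while the chain is still
        # alive, rather than during garbage collection.
        for d in listener_queue:
            d.cancel()
        listener_queue.clear()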
17 changes: 13 additions & 4 deletions tests/rest/client/v2_alpha/test_sync.py
@@ -19,7 +19,6 @@
from synapse.rest.client.v2_alpha import sync

from tests import unittest
- from tests.server import TimedOutException


class FilterTestCase(unittest.HomeserverTestCase):
@@ -88,7 +87,7 @@ def test_sync_backwards_typing(self):
than the now-reset serial.
"""
typing_url = "/rooms/%s/typing/%s?access_token=%s"
sync_url = "/sync?timeout=3000000&access_token=%s&since=%s"
sync_url = "/sync?timeout=3000&access_token=%s&since=%s"

# Register the user who gets notified
user_id = self.register_user("user", "pass")
@@ -177,14 +176,24 @@ def test_sync_backwards_typing(self):
)
self.render(request)
self.assertEquals(200, channel.code)
self.assertEqual(
len(channel.json_body["rooms"]["join"][room]["ephemeral"]["events"]), 1,
)
next_batch = channel.json_body["next_batch"]

# Clear the typing information, so that it doesn't think everything is
# in the future.
typing._reset()

- # Now it SHOULD fail as it never completes!
+ # Now we should get a null response back.
request, channel = self.make_request(
"GET", sync_url % (access_token, next_batch)
)
- self.assertRaises(TimedOutException, self.render, request)
+ self.render(request)
+ self.assertEquals(200, channel.code)

# rooms list should be empty
self.assertEqual(channel.json_body["rooms"]["join"], {})

# sync token should be the same as before
self.assertEqual(channel.json_body["next_batch"], next_batch)
24 changes: 17 additions & 7 deletions tests/server.py
@@ -190,24 +190,34 @@ def make_request(
return req, channel


- def wait_until_result(clock, request, timeout=100):
+ def wait_until_result(clock, request):
"""
Wait until the request is finished.
"""
# this function used to give up if the request hadn't completed within 100
# iterations. However, this led to problems with log contexts.
#
# The problem is that because we (a) give up waiting for the result of the
# request, and (b) throw away the HomeServer instance (and hence the Notifier,
# which would otherwise keep references to requests it is supposed to be
# waking up), we can end up with orphaned chains of incomplete Deferreds
# which no longer have any references to them and so get garbage-collected.
#
# It further turns out that one of the many unpleasant effects of the way we
# do logcontexts is that if a deferred chain gets garbage-collected before
# it completes, we can end up leaking log-contexts (see log_contexts.rst
# for more details).
#
# In short, a timeout here makes things explode down the line.

Review thread on this comment:

Contributor:
Lack of a timeout here causes an infinite loop on jammed web requests in tests...

Member (author):
doesn't the test have an overall timeout?

Member (author):
I guess I don't mind having an explicit timeout here, but what is a problem is making it an expected part of a test.

Contributor:
The timeout is done by the global reactor, this here does not yield to the global reactor, and therefore can run infinitely and not trip the global reactor timeout.

Member (author):
well, that's sad, but given that if we're in that situation the test should have failed, is it really a problem?


clock.run()
- x = 0

while not request.finished:

# If there's a producer, tell it to resume producing so we get content
if request._channel._producer:
request._channel._producer.resumeProducing()

- x += 1

- if x > timeout:
-     raise TimedOutException("Timed out waiting for request to finish.")

clock.advance(0.1)

