From 77dfc1f93967f4157ba961c3b5201206c1bbf797 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 17 Oct 2023 07:32:40 -0400 Subject: [PATCH] Fix a bug where servers could be marked as up when they were failing (#16506) After this change a server will only be reported as back online if they were previously having requests fail. --- changelog.d/16506.bugfix | 1 + synapse/util/retryutils.py | 28 +++++++------ tests/util/test_retryutils.py | 75 +++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 12 deletions(-) create mode 100644 changelog.d/16506.bugfix diff --git a/changelog.d/16506.bugfix b/changelog.d/16506.bugfix new file mode 100644 index 0000000000..a2c7e82b9e --- /dev/null +++ b/changelog.d/16506.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error. diff --git a/synapse/util/retryutils.py b/synapse/util/retryutils.py index 0e1f907667..547202c96b 100644 --- a/synapse/util/retryutils.py +++ b/synapse/util/retryutils.py @@ -170,10 +170,10 @@ class RetryDestinationLimiter: database in milliseconds, or zero if the last request was successful. backoff_on_404: Back off if we get a 404 - backoff_on_failure: set to False if we should not increase the retry interval on a failure. - + notifier: A notifier used to mark servers as up. + replication_client A replication client used to mark servers as up. backoff_on_all_error_codes: Whether we should back off on any error code. """ @@ -237,6 +237,9 @@ class RetryDestinationLimiter: else: valid_err_code = False + # Whether previous requests to the destination had been failing. + previously_failing = bool(self.failure_ts) + if success: # We connected successfully. if not self.retry_interval: @@ -282,6 +285,9 @@ class RetryDestinationLimiter: if self.failure_ts is None: self.failure_ts = retry_last_ts + # Whether the current request to the destination had been failing. + currently_failing = bool(self.failure_ts) + async def store_retry_timings() -> None: try: await self.store.set_destination_retry_timings( @@ -291,17 +297,15 @@ class RetryDestinationLimiter: self.retry_interval, ) - if self.notifier: - # Inform the relevant places that the remote server is back up. - self.notifier.notify_remote_server_up(self.destination) + # If the server was previously failing, but is no longer. + if previously_failing and not currently_failing: + if self.notifier: + # Inform the relevant places that the remote server is back up. + self.notifier.notify_remote_server_up(self.destination) - if self.replication_client: - # If we're on a worker we try and inform master about this. The - # replication client doesn't hook into the notifier to avoid - # infinite loops where we send a `REMOTE_SERVER_UP` command to - # master, which then echoes it back to us which in turn pokes - # the notifier. - self.replication_client.send_remote_server_up(self.destination) + if self.replication_client: + # Inform other workers that the remote server is up. + self.replication_client.send_remote_server_up(self.destination) except Exception: logger.exception("Failed to store destination_retry_timings") diff --git a/tests/util/test_retryutils.py b/tests/util/test_retryutils.py index 4bcd17a6fc..ad88b24566 100644 --- a/tests/util/test_retryutils.py +++ b/tests/util/test_retryutils.py @@ -11,6 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + +from synapse.notifier import Notifier +from synapse.replication.tcp.handler import ReplicationCommandHandler from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter from tests.unittest import HomeserverTestCase @@ -109,6 +113,77 @@ class RetryLimiterTestCase(HomeserverTestCase): new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) self.assertIsNone(new_timings) + def test_notifier_replication(self) -> None: + """Ensure the notifier/replication client is called only when expected.""" + store = self.hs.get_datastores().main + + notifier = mock.Mock(spec=Notifier) + replication_client = mock.Mock(spec=ReplicationCommandHandler) + + limiter = self.get_success( + get_retry_limiter( + "test_dest", + self.clock, + store, + notifier=notifier, + replication_client=replication_client, + ) + ) + + # The server is already up, nothing should occur. + self.pump(1) + with limiter: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + self.assertIsNone(new_timings) + notifier.notify_remote_server_up.assert_not_called() + replication_client.send_remote_server_up.assert_not_called() + + # Attempt again, but return an error. This will cause new retry timings, but + # should not trigger server up notifications. + self.pump(1) + try: + with limiter: + raise AssertionError("argh") + except AssertionError: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + # The exact retry timings are tested separately. + self.assertIsNotNone(new_timings) + notifier.notify_remote_server_up.assert_not_called() + replication_client.send_remote_server_up.assert_not_called() + + # A second failing request should be treated as the above. + self.pump(1) + try: + with limiter: + raise AssertionError("argh") + except AssertionError: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + # The exact retry timings are tested separately. + self.assertIsNotNone(new_timings) + notifier.notify_remote_server_up.assert_not_called() + replication_client.send_remote_server_up.assert_not_called() + + # A final successful attempt should generate a server up notification. + self.pump(1) + with limiter: + pass + self.pump() + + new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) + # The exact retry timings are tested separately. + self.assertIsNone(new_timings) + notifier.notify_remote_server_up.assert_called_once_with("test_dest") + replication_client.send_remote_server_up.assert_called_once_with("test_dest") + def test_max_retry_interval(self) -> None: """Test that `destination_max_retry_interval` setting works as expected""" store = self.hs.get_datastores().main