Fix a bug where servers could be marked as up when they were failing (#16506)

After this change a server will only be reported as back online
if they were previously having requests fail.
This commit is contained in:
Patrick Cloke 2023-10-17 07:32:40 -04:00 committed by GitHub
parent 7291c68eea
commit 77dfc1f939
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 92 additions and 12 deletions

1
changelog.d/16506.bugfix Normal file
View File

@ -0,0 +1 @@
Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.

View File

@ -170,10 +170,10 @@ class RetryDestinationLimiter:
database in milliseconds, or zero if the last request was database in milliseconds, or zero if the last request was
successful. successful.
backoff_on_404: Back off if we get a 404 backoff_on_404: Back off if we get a 404
backoff_on_failure: set to False if we should not increase the backoff_on_failure: set to False if we should not increase the
retry interval on a failure. retry interval on a failure.
notifier: A notifier used to mark servers as up.
replication_client: A replication client used to mark servers as up. replication_client: A replication client used to mark servers as up.
backoff_on_all_error_codes: Whether we should back off on any backoff_on_all_error_codes: Whether we should back off on any
error code. error code.
""" """
@ -237,6 +237,9 @@ class RetryDestinationLimiter:
else: else:
valid_err_code = False valid_err_code = False
# Whether previous requests to the destination had been failing.
previously_failing = bool(self.failure_ts)
if success: if success:
# We connected successfully. # We connected successfully.
if not self.retry_interval: if not self.retry_interval:
@ -282,6 +285,9 @@ class RetryDestinationLimiter:
if self.failure_ts is None: if self.failure_ts is None:
self.failure_ts = retry_last_ts self.failure_ts = retry_last_ts
# Whether the current request to the destination had been failing.
currently_failing = bool(self.failure_ts)
async def store_retry_timings() -> None: async def store_retry_timings() -> None:
try: try:
await self.store.set_destination_retry_timings( await self.store.set_destination_retry_timings(
@ -291,17 +297,15 @@ class RetryDestinationLimiter:
self.retry_interval, self.retry_interval,
) )
if self.notifier: # If the server was previously failing, but is no longer.
# Inform the relevant places that the remote server is back up. if previously_failing and not currently_failing:
self.notifier.notify_remote_server_up(self.destination) if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)
if self.replication_client: if self.replication_client:
# If we're on a worker we try and inform master about this. The # Inform other workers that the remote server is up.
# replication client doesn't hook into the notifier to avoid self.replication_client.send_remote_server_up(self.destination)
# infinite loops where we send a `REMOTE_SERVER_UP` command to
# master, which then echoes it back to us which in turn pokes
# the notifier.
self.replication_client.send_remote_server_up(self.destination)
except Exception: except Exception:
logger.exception("Failed to store destination_retry_timings") logger.exception("Failed to store destination_retry_timings")

View File

@ -11,6 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from unittest import mock
from synapse.notifier import Notifier
from synapse.replication.tcp.handler import ReplicationCommandHandler
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
from tests.unittest import HomeserverTestCase from tests.unittest import HomeserverTestCase
@ -109,6 +113,77 @@ class RetryLimiterTestCase(HomeserverTestCase):
new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings) self.assertIsNone(new_timings)
def test_notifier_replication(self) -> None:
    """The notifier/replication client must fire only on an up-transition.

    A destination that was never failing should not generate "server up"
    notifications on success; only a success that follows one or more
    failures should, and then exactly once.
    """
    store = self.hs.get_datastores().main

    notifier = mock.Mock(spec=Notifier)
    replication_client = mock.Mock(spec=ReplicationCommandHandler)

    # A single limiter instance is reused for every request below, so it
    # carries the failure state between them.
    limiter = self.get_success(
        get_retry_limiter(
            "test_dest",
            self.clock,
            store,
            notifier=notifier,
            replication_client=replication_client,
        )
    )

    def fail_request() -> None:
        # Run one request through the limiter that errors out; the raised
        # exception is swallowed here, but the limiter still records it.
        try:
            with limiter:
                raise AssertionError("argh")
        except AssertionError:
            pass

    def fetch_timings():
        # Current persisted retry timings for the destination (None if clear).
        return self.get_success(store.get_destination_retry_timings("test_dest"))

    def assert_not_notified() -> None:
        notifier.notify_remote_server_up.assert_not_called()
        replication_client.send_remote_server_up.assert_not_called()

    # The destination starts healthy: a successful request must not notify.
    self.pump(1)
    with limiter:
        pass
    self.pump()
    self.assertIsNone(fetch_timings())
    assert_not_notified()

    # First failure: retry timings get recorded, but there is still no
    # "server up" notification.
    self.pump(1)
    fail_request()
    self.pump()
    # The exact retry timings are tested separately.
    self.assertIsNotNone(fetch_timings())
    assert_not_notified()

    # A second consecutive failure is treated the same way.
    self.pump(1)
    fail_request()
    self.pump()
    # The exact retry timings are tested separately.
    self.assertIsNotNone(fetch_timings())
    assert_not_notified()

    # Recovery: a success after failures triggers exactly one notification
    # through each channel.
    self.pump(1)
    with limiter:
        pass
    self.pump()
    # The exact retry timings are tested separately.
    self.assertIsNone(fetch_timings())
    notifier.notify_remote_server_up.assert_called_once_with("test_dest")
    replication_client.send_remote_server_up.assert_called_once_with("test_dest")
def test_max_retry_interval(self) -> None: def test_max_retry_interval(self) -> None:
"""Test that `destination_max_retry_interval` setting works as expected""" """Test that `destination_max_retry_interval` setting works as expected"""
store = self.hs.get_datastores().main store = self.hs.get_datastores().main