Allow config of the backoff algorithm for the federation client. (#15754)

Adds three new configuration variables:

* destination_min_retry_interval is identical to before (10m).
* destination_retry_multiplier is now 2 instead of 5, so the maximum value will
  be reached more slowly.
* destination_max_retry_interval is one week instead of (essentially) infinity.

Capping this will cause destinations to continue to be retried sometimes instead
of being lost forever. The previous value was 2 ^ 62 milliseconds.
This commit is contained in:
Mathieu Velten 2023-08-03 20:36:55 +02:00 committed by GitHub
parent 9c462f18a4
commit f0a860908b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 64 additions and 26 deletions

1
changelog.d/15754.misc Normal file
View File

@@ -0,0 +1 @@
Allow for the configuration of the backoff algorithm for federation destinations.

View File

@@ -1242,6 +1242,14 @@ like sending a federation transaction.
* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts. * `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts. * `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.
The following options control the retry logic when communicating with a specific homeserver destination.
Unlike the previous configuration options, these values apply across all requests
for a given destination and the state of the backoff is stored in the database.
* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m.
* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2.
* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week.
Example configuration: Example configuration:
```yaml ```yaml
federation: federation:
@@ -1250,6 +1258,9 @@ federation:
max_long_retry_delay: 100s max_long_retry_delay: 100s
max_short_retries: 5 max_short_retries: 5
max_long_retries: 20 max_long_retries: 20
destination_min_retry_interval: 30s
destination_retry_multiplier: 5
destination_max_retry_interval: 12h
``` ```
--- ---
## Caching ## Caching

View File

@@ -65,5 +65,23 @@ class FederationConfig(Config):
self.max_long_retries = federation_config.get("max_long_retries", 10) self.max_long_retries = federation_config.get("max_long_retries", 10)
self.max_short_retries = federation_config.get("max_short_retries", 3) self.max_short_retries = federation_config.get("max_short_retries", 3)
# Allow for the configuration of the backoff algorithm used
# when trying to reach an unavailable destination.
# Unlike the previous configuration, those values apply across
# multiple requests and the state of the backoff is stored in the DB.
self.destination_min_retry_interval_ms = Config.parse_duration(
federation_config.get("destination_min_retry_interval", "10m")
)
self.destination_retry_multiplier = federation_config.get(
"destination_retry_multiplier", 2
)
self.destination_max_retry_interval_ms = min(
Config.parse_duration(
federation_config.get("destination_max_retry_interval", "7d")
),
# Set a hard-limit to not overflow the database column.
2**62,
)
_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}} _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}

View File

@@ -27,15 +27,6 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# the initial backoff, after the first transaction fails
MIN_RETRY_INTERVAL = 10 * 60 * 1000
# how much we multiply the backoff by after each subsequent fail
RETRY_MULTIPLIER = 5
# a cap on the backoff. (Essentially none)
MAX_RETRY_INTERVAL = 2**62
class NotRetryingDestination(Exception): class NotRetryingDestination(Exception):
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str): def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
@@ -169,6 +160,16 @@ class RetryDestinationLimiter:
self.notifier = notifier self.notifier = notifier
self.replication_client = replication_client self.replication_client = replication_client
self.destination_min_retry_interval_ms = (
self.store.hs.config.federation.destination_min_retry_interval_ms
)
self.destination_retry_multiplier = (
self.store.hs.config.federation.destination_retry_multiplier
)
self.destination_max_retry_interval_ms = (
self.store.hs.config.federation.destination_max_retry_interval_ms
)
def __enter__(self) -> None: def __enter__(self) -> None:
pass pass
@@ -220,13 +221,15 @@ class RetryDestinationLimiter:
# We couldn't connect. # We couldn't connect.
if self.retry_interval: if self.retry_interval:
self.retry_interval = int( self.retry_interval = int(
self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4) self.retry_interval
* self.destination_retry_multiplier
* random.uniform(0.8, 1.4)
) )
if self.retry_interval >= MAX_RETRY_INTERVAL: if self.retry_interval >= self.destination_max_retry_interval_ms:
self.retry_interval = MAX_RETRY_INTERVAL self.retry_interval = self.destination_max_retry_interval_ms
else: else:
self.retry_interval = MIN_RETRY_INTERVAL self.retry_interval = self.destination_min_retry_interval_ms
logger.info( logger.info(
"Connection to %s was unsuccessful (%s(%s)); backoff now %i", "Connection to %s was unsuccessful (%s(%s)); backoff now %i",

View File

@@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor
from synapse.server import HomeServer from synapse.server import HomeServer
from synapse.storage.databases.main.transactions import DestinationRetryTimings from synapse.storage.databases.main.transactions import DestinationRetryTimings
from synapse.util import Clock from synapse.util import Clock
from synapse.util.retryutils import MAX_RETRY_INTERVAL
from tests.unittest import HomeserverTestCase from tests.unittest import HomeserverTestCase
@@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase):
self.get_success(d) self.get_success(d)
def test_large_destination_retry(self) -> None: def test_large_destination_retry(self) -> None:
max_retry_interval_ms = (
self.hs.config.federation.destination_max_retry_interval_ms
)
d = self.store.set_destination_retry_timings( d = self.store.set_destination_retry_timings(
"example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL "example.com",
max_retry_interval_ms,
max_retry_interval_ms,
max_retry_interval_ms,
) )
self.get_success(d) self.get_success(d)

View File

@@ -11,12 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from synapse.util.retryutils import ( from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
MIN_RETRY_INTERVAL,
RETRY_MULTIPLIER,
NotRetryingDestination,
get_retry_limiter,
)
from tests.unittest import HomeserverTestCase from tests.unittest import HomeserverTestCase
@@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase):
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
min_retry_interval_ms = (
self.hs.config.federation.destination_min_retry_interval_ms
)
retry_multiplier = self.hs.config.federation.destination_retry_multiplier
self.pump(1) self.pump(1)
try: try:
with limiter: with limiter:
@@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
assert new_timings is not None assert new_timings is not None
self.assertEqual(new_timings.failure_ts, failure_ts) self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings.retry_last_ts, failure_ts) self.assertEqual(new_timings.retry_last_ts, failure_ts)
self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL) self.assertEqual(new_timings.retry_interval, min_retry_interval_ms)
# now if we try again we should get a failure # now if we try again we should get a failure
self.get_failure( self.get_failure(
@@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
# advance the clock and try again # advance the clock and try again
# #
self.pump(MIN_RETRY_INTERVAL) self.pump(min_retry_interval_ms)
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
self.pump(1) self.pump(1)
@@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase):
self.assertEqual(new_timings.failure_ts, failure_ts) self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings.retry_last_ts, retry_ts) self.assertEqual(new_timings.retry_last_ts, retry_ts)
self.assertGreaterEqual( self.assertGreaterEqual(
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5 new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5
) )
self.assertLessEqual( self.assertLessEqual(
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0 new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0
) )
# #
# one more go, with success # one more go, with success
# #
self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0) self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0)
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store)) limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
self.pump(1) self.pump(1)