Allow config of the backoff algorithm for the federation client. (#15754)
Adds three new configuration variables: * destination_min_retry_interval is identical to before (10mn). * destination_retry_multiplier is now 2 instead of 5, the maximum value will be reached slower. * destination_max_retry_interval is one day instead of (essentially) infinity. Capping this will cause destinations to continue to be retried sometimes instead of being lost forever. The previous value was 2 ^ 62 milliseconds.
This commit is contained in:
parent
9c462f18a4
commit
f0a860908b
|
@ -0,0 +1 @@
|
||||||
|
Allow for the configuration of the backoff algorithm for federation destinations.
|
|
@ -1242,6 +1242,14 @@ like sending a federation transaction.
|
||||||
* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
|
* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
|
||||||
* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.
|
* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.
|
||||||
|
|
||||||
|
The following options control the retry logic when communicating with a specific homeserver destination.
|
||||||
|
Unlike the previous configuration options, these values apply across all requests
|
||||||
|
for a given destination and the state of the backoff is stored in the database.
|
||||||
|
|
||||||
|
* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m.
|
||||||
|
* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2.
|
||||||
|
* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week.
|
||||||
|
|
||||||
Example configuration:
|
Example configuration:
|
||||||
```yaml
|
```yaml
|
||||||
federation:
|
federation:
|
||||||
|
@ -1250,6 +1258,9 @@ federation:
|
||||||
max_long_retry_delay: 100s
|
max_long_retry_delay: 100s
|
||||||
max_short_retries: 5
|
max_short_retries: 5
|
||||||
max_long_retries: 20
|
max_long_retries: 20
|
||||||
|
destination_min_retry_interval: 30s
|
||||||
|
destination_retry_multiplier: 5
|
||||||
|
destination_max_retry_interval: 12h
|
||||||
```
|
```
|
||||||
---
|
---
|
||||||
## Caching
|
## Caching
|
||||||
|
|
|
@ -65,5 +65,23 @@ class FederationConfig(Config):
|
||||||
self.max_long_retries = federation_config.get("max_long_retries", 10)
|
self.max_long_retries = federation_config.get("max_long_retries", 10)
|
||||||
self.max_short_retries = federation_config.get("max_short_retries", 3)
|
self.max_short_retries = federation_config.get("max_short_retries", 3)
|
||||||
|
|
||||||
|
# Allow for the configuration of the backoff algorithm used
|
||||||
|
# when trying to reach an unavailable destination.
|
||||||
|
# Unlike previous configuration those values applies across
|
||||||
|
# multiple requests and the state of the backoff is stored on DB.
|
||||||
|
self.destination_min_retry_interval_ms = Config.parse_duration(
|
||||||
|
federation_config.get("destination_min_retry_interval", "10m")
|
||||||
|
)
|
||||||
|
self.destination_retry_multiplier = federation_config.get(
|
||||||
|
"destination_retry_multiplier", 2
|
||||||
|
)
|
||||||
|
self.destination_max_retry_interval_ms = min(
|
||||||
|
Config.parse_duration(
|
||||||
|
federation_config.get("destination_max_retry_interval", "7d")
|
||||||
|
),
|
||||||
|
# Set a hard-limit to not overflow the database column.
|
||||||
|
2**62,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
|
_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
|
||||||
|
|
|
@ -27,15 +27,6 @@ if TYPE_CHECKING:
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# the initial backoff, after the first transaction fails
|
|
||||||
MIN_RETRY_INTERVAL = 10 * 60 * 1000
|
|
||||||
|
|
||||||
# how much we multiply the backoff by after each subsequent fail
|
|
||||||
RETRY_MULTIPLIER = 5
|
|
||||||
|
|
||||||
# a cap on the backoff. (Essentially none)
|
|
||||||
MAX_RETRY_INTERVAL = 2**62
|
|
||||||
|
|
||||||
|
|
||||||
class NotRetryingDestination(Exception):
|
class NotRetryingDestination(Exception):
|
||||||
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
|
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
|
||||||
|
@ -169,6 +160,16 @@ class RetryDestinationLimiter:
|
||||||
self.notifier = notifier
|
self.notifier = notifier
|
||||||
self.replication_client = replication_client
|
self.replication_client = replication_client
|
||||||
|
|
||||||
|
self.destination_min_retry_interval_ms = (
|
||||||
|
self.store.hs.config.federation.destination_min_retry_interval_ms
|
||||||
|
)
|
||||||
|
self.destination_retry_multiplier = (
|
||||||
|
self.store.hs.config.federation.destination_retry_multiplier
|
||||||
|
)
|
||||||
|
self.destination_max_retry_interval_ms = (
|
||||||
|
self.store.hs.config.federation.destination_max_retry_interval_ms
|
||||||
|
)
|
||||||
|
|
||||||
def __enter__(self) -> None:
|
def __enter__(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -220,13 +221,15 @@ class RetryDestinationLimiter:
|
||||||
# We couldn't connect.
|
# We couldn't connect.
|
||||||
if self.retry_interval:
|
if self.retry_interval:
|
||||||
self.retry_interval = int(
|
self.retry_interval = int(
|
||||||
self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
|
self.retry_interval
|
||||||
|
* self.destination_retry_multiplier
|
||||||
|
* random.uniform(0.8, 1.4)
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.retry_interval >= MAX_RETRY_INTERVAL:
|
if self.retry_interval >= self.destination_max_retry_interval_ms:
|
||||||
self.retry_interval = MAX_RETRY_INTERVAL
|
self.retry_interval = self.destination_max_retry_interval_ms
|
||||||
else:
|
else:
|
||||||
self.retry_interval = MIN_RETRY_INTERVAL
|
self.retry_interval = self.destination_min_retry_interval_ms
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
|
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
|
||||||
|
|
|
@ -17,7 +17,6 @@ from twisted.test.proto_helpers import MemoryReactor
|
||||||
from synapse.server import HomeServer
|
from synapse.server import HomeServer
|
||||||
from synapse.storage.databases.main.transactions import DestinationRetryTimings
|
from synapse.storage.databases.main.transactions import DestinationRetryTimings
|
||||||
from synapse.util import Clock
|
from synapse.util import Clock
|
||||||
from synapse.util.retryutils import MAX_RETRY_INTERVAL
|
|
||||||
|
|
||||||
from tests.unittest import HomeserverTestCase
|
from tests.unittest import HomeserverTestCase
|
||||||
|
|
||||||
|
@ -57,8 +56,14 @@ class TransactionStoreTestCase(HomeserverTestCase):
|
||||||
self.get_success(d)
|
self.get_success(d)
|
||||||
|
|
||||||
def test_large_destination_retry(self) -> None:
|
def test_large_destination_retry(self) -> None:
|
||||||
|
max_retry_interval_ms = (
|
||||||
|
self.hs.config.federation.destination_max_retry_interval_ms
|
||||||
|
)
|
||||||
d = self.store.set_destination_retry_timings(
|
d = self.store.set_destination_retry_timings(
|
||||||
"example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL
|
"example.com",
|
||||||
|
max_retry_interval_ms,
|
||||||
|
max_retry_interval_ms,
|
||||||
|
max_retry_interval_ms,
|
||||||
)
|
)
|
||||||
self.get_success(d)
|
self.get_success(d)
|
||||||
|
|
||||||
|
|
|
@ -11,12 +11,7 @@
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from synapse.util.retryutils import (
|
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
|
||||||
MIN_RETRY_INTERVAL,
|
|
||||||
RETRY_MULTIPLIER,
|
|
||||||
NotRetryingDestination,
|
|
||||||
get_retry_limiter,
|
|
||||||
)
|
|
||||||
|
|
||||||
from tests.unittest import HomeserverTestCase
|
from tests.unittest import HomeserverTestCase
|
||||||
|
|
||||||
|
@ -42,6 +37,11 @@ class RetryLimiterTestCase(HomeserverTestCase):
|
||||||
|
|
||||||
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
|
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
|
||||||
|
|
||||||
|
min_retry_interval_ms = (
|
||||||
|
self.hs.config.federation.destination_min_retry_interval_ms
|
||||||
|
)
|
||||||
|
retry_multiplier = self.hs.config.federation.destination_retry_multiplier
|
||||||
|
|
||||||
self.pump(1)
|
self.pump(1)
|
||||||
try:
|
try:
|
||||||
with limiter:
|
with limiter:
|
||||||
|
@ -57,7 +57,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
|
||||||
assert new_timings is not None
|
assert new_timings is not None
|
||||||
self.assertEqual(new_timings.failure_ts, failure_ts)
|
self.assertEqual(new_timings.failure_ts, failure_ts)
|
||||||
self.assertEqual(new_timings.retry_last_ts, failure_ts)
|
self.assertEqual(new_timings.retry_last_ts, failure_ts)
|
||||||
self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
|
self.assertEqual(new_timings.retry_interval, min_retry_interval_ms)
|
||||||
|
|
||||||
# now if we try again we should get a failure
|
# now if we try again we should get a failure
|
||||||
self.get_failure(
|
self.get_failure(
|
||||||
|
@ -68,7 +68,7 @@ class RetryLimiterTestCase(HomeserverTestCase):
|
||||||
# advance the clock and try again
|
# advance the clock and try again
|
||||||
#
|
#
|
||||||
|
|
||||||
self.pump(MIN_RETRY_INTERVAL)
|
self.pump(min_retry_interval_ms)
|
||||||
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
|
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
|
||||||
|
|
||||||
self.pump(1)
|
self.pump(1)
|
||||||
|
@ -87,16 +87,16 @@ class RetryLimiterTestCase(HomeserverTestCase):
|
||||||
self.assertEqual(new_timings.failure_ts, failure_ts)
|
self.assertEqual(new_timings.failure_ts, failure_ts)
|
||||||
self.assertEqual(new_timings.retry_last_ts, retry_ts)
|
self.assertEqual(new_timings.retry_last_ts, retry_ts)
|
||||||
self.assertGreaterEqual(
|
self.assertGreaterEqual(
|
||||||
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
|
new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 0.5
|
||||||
)
|
)
|
||||||
self.assertLessEqual(
|
self.assertLessEqual(
|
||||||
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
|
new_timings.retry_interval, min_retry_interval_ms * retry_multiplier * 2.0
|
||||||
)
|
)
|
||||||
|
|
||||||
#
|
#
|
||||||
# one more go, with success
|
# one more go, with success
|
||||||
#
|
#
|
||||||
self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
|
self.reactor.advance(min_retry_interval_ms * retry_multiplier * 2.0)
|
||||||
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
|
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
|
||||||
|
|
||||||
self.pump(1)
|
self.pump(1)
|
||||||
|
|
Loading…
Reference in New Issue