From ba6fa1a700c7b3208e6dda3eee49ece4c53d0b01 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Fri, 31 Jul 2020 18:59:28 -0700
Subject: [PATCH 1/3] zephyr: Use exponential backoffs in retry loops.

This reduces the number of retries that might spam APIs.  There is
some complexity here which is left unmanaged -- for instance,
maybe_restart_mirroring_script makes a number of restart attempts and
then fails, but will be retried every 15s by the surrounding
`process_loop`.  Previously, it would merely have looped forever
inside maybe_restart_mirroring_script.

Three loops are intentionally left as infinite `while True` loops
that merely cap their backoff at the default 90s.  Their callers do
not expect, or have any way to handle more gracefully, a failure of
the expected-infinite loop in `process_loop` or `zulip_to_zephyr`.
They maintain their previous behavior of retrying forever, albeit
more slowly.
---
 .../zephyr/zephyr_mirror_backend.py | 22 +++++++++++++++++------
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/zulip/integrations/zephyr/zephyr_mirror_backend.py b/zulip/integrations/zephyr/zephyr_mirror_backend.py
index 8b5197118..326c58876 100755
--- a/zulip/integrations/zephyr/zephyr_mirror_backend.py
+++ b/zulip/integrations/zephyr/zephyr_mirror_backend.py
@@ -17,6 +17,8 @@ import tempfile
 import select
 
+from zulip import RandomExponentialBackoff
+
 DEFAULT_SITE = "https://api.zulip.com"
 
 class States:
@@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None:
     except OSError:
         # We don't care whether we failed to cancel subs properly, but we should log it
         logger.exception("")
-    while True:
+    backoff = RandomExponentialBackoff(
+        maximum_retries=3,
+    )
+    while backoff.keep_going():
         try:
             os.execvp(os.path.abspath(__file__), sys.argv)
+            # No need for backoff.succeed, since this can't be reached
         except Exception:
             logger.exception("Error restarting mirroring script; trying again... Traceback:")
-            time.sleep(1)
+            backoff.fail()
+    raise Exception("Failed to reload too many times, aborting!")
 
 def process_loop(log: Optional[IO[Any]]) -> None:
     restart_check_count = 0
     last_check_time = time.time()
+    recieve_backoff = RandomExponentialBackoff()
     while True:
         select.select([zephyr._z.getFD()], [], [], 15)
         try:
+            process_backoff = RandomExponentialBackoff()
             # Fetch notices from the queue until its empty
             while True:
                 notice = zephyr.receive(block=False)
+                recieve_backoff.succeed()
                 if notice is None:
                     break
                 try:
                     process_notice(notice, log)
+                    process_backoff.succeed()
                 except Exception:
                     logger.exception("Error relaying zephyr:")
-                    time.sleep(2)
+                    process_backoff.fail()
         except Exception:
             logger.exception("Error checking for new zephyrs:")
-            time.sleep(1)
+            recieve_backoff.fail()
             continue
 
         if time.time() - last_check_time > 15:
@@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
 def zulip_to_zephyr(options: int) -> None:
     # Sync messages from zulip to zephyr
     logger.info("Starting syncing messages.")
+    backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
     while True:
         try:
             zulip_client.call_on_each_message(maybe_forward_to_zephyr)
         except Exception:
             logger.exception("Error syncing messages:")
-            time.sleep(1)
+            backoff.fail()
 
 def subscribed_to_mail_messages() -> bool:
     # In case we have lost our AFS tokens and those won't be able to

From 739b927cf53a39620207849cfbeb78c1ed3220c7 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Fri, 31 Jul 2020 19:00:35 -0700
Subject: [PATCH 2/3] zephyr: Mark the infinite loops as NoReturn.
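
Annotating these functions with `NoReturn` (from `typing`) records that
every path through them either loops forever or raises, so a type
checker such as mypy can treat any statement after a call to them as
unreachable (reported under --warn-unreachable); that is presumably why
the `sys.exit(0)` following `zulip_to_zephyr(options)` is dropped below.
A minimal, illustrative sketch of the idea -- the function names here
are made up and are not part of this patch:

    from typing import NoReturn

    def poll_once() -> None:
        ...  # stand-in for the real per-iteration work

    def serve_forever() -> NoReturn:
        # Never returns normally: every path loops or raises.
        while True:
            poll_once()

    serve_forever()
    print("done")  # mypy --warn-unreachable flags this line as unreachable
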
---
 zulip/integrations/zephyr/zephyr_mirror_backend.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/zulip/integrations/zephyr/zephyr_mirror_backend.py b/zulip/integrations/zephyr/zephyr_mirror_backend.py
index 326c58876..9d0b575a6 100755
--- a/zulip/integrations/zephyr/zephyr_mirror_backend.py
+++ b/zulip/integrations/zephyr/zephyr_mirror_backend.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from typing import Any, Dict, IO, List, Optional, Set, Text, Tuple, cast
+from typing import Any, Dict, IO, List, NoReturn, Optional, Set, Text, Tuple, cast
 from types import FrameType
 
 import sys
@@ -232,7 +232,7 @@ def maybe_restart_mirroring_script() -> None:
             backoff.fail()
     raise Exception("Failed to reload too many times, aborting!")
 
-def process_loop(log: Optional[IO[Any]]) -> None:
+def process_loop(log: Optional[IO[Any]]) -> NoReturn:
     restart_check_count = 0
     last_check_time = time.time()
     recieve_backoff = RandomExponentialBackoff()
@@ -767,7 +767,7 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
         # whole process
         logger.exception("Error forwarding message:")
 
-def zulip_to_zephyr(options: int) -> None:
+def zulip_to_zephyr(options: int) -> NoReturn:
     # Sync messages from zulip to zephyr
     logger.info("Starting syncing messages.")
     backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
@@ -1146,7 +1146,6 @@ def die_gracefully(signal: int, frame: FrameType) -> None:
         # Run the zulip => zephyr mirror in the child
         configure_logger(logger, "zulip=>zephyr")
         zulip_to_zephyr(options)
-        sys.exit(0)
     else:
         child_pid = None
     CURRENT_STATE = States.ZephyrToZulip

From 331d78c2cf40da83b6b5ce88d308eea6bfa07fd2 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Mon, 3 Aug 2020 13:04:29 -0700
Subject: [PATCH 3/3] doc: Document the backoff class better.

---
 zulip/zulip/__init__.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/zulip/zulip/__init__.py b/zulip/zulip/__init__.py
index 94e4be07d..f61935be1 100644
--- a/zulip/zulip/__init__.py
+++ b/zulip/zulip/__init__.py
@@ -33,7 +33,27 @@ API_VERSTRING = "v1/"
 
 
 class CountingBackoff:
-    def __init__(self, maximum_retries: int = 10, timeout_success_equivalent: Optional[float] = None, delay_cap: float = 90.0) -> None:
+    def __init__(
+        self,
+        maximum_retries: int = 10,
+        timeout_success_equivalent: Optional[float] = None,
+        delay_cap: float = 90.0,
+    ) -> None:
+        """Sets up a retry-backoff object. Example usage:
+            backoff = zulip.CountingBackoff()
+            while backoff.keep_going():
+                try:
+                    something()
+                    backoff.succeed()
+                except Exception:
+                    backoff.fail()
+
+        timeout_success_equivalent is used in cases where 'success' is
+        never possible to determine automatically; it sets the
+        threshold in seconds before the next keep_going/fail, above
+        which the last run is treated like it was a success.
+
+        """
         self.number_of_retries = 0
         self.maximum_retries = maximum_retries
         self.timeout_success_equivalent = timeout_success_equivalent
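
For readers who have not used these backoff helpers before, the two
retry patterns adopted by the first patch can be sketched roughly as
follows.  This is illustrative only: `flaky_call` is a made-up stand-in
for the real work (os.execvp, zephyr.receive, and so on), and it
assumes the `zulip` package is importable:

    import zulip

    def flaky_call() -> None:
        ...  # stand-in for the operation that may fail

    def bounded_retries() -> None:
        # Pattern used by maybe_restart_mirroring_script: try a few
        # times, sleeping longer after each failure, then give up loudly.
        backoff = zulip.RandomExponentialBackoff(maximum_retries=3)
        while backoff.keep_going():
            try:
                flaky_call()
                backoff.succeed()
                return
            except Exception:
                backoff.fail()  # sleeps with a growing, capped delay
        raise Exception("Too many failures, giving up")

    def retry_forever() -> None:
        # Pattern used by process_loop and zulip_to_zephyr: never give
        # up, but back off (up to the default 90s cap) instead of
        # spinning when the call keeps failing.
        backoff = zulip.RandomExponentialBackoff(timeout_success_equivalent=120)
        while True:
            try:
                flaky_call()
            except Exception:
                backoff.fail()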