From ba6fa1a700c7b3208e6dda3eee49ece4c53d0b01 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Fri, 31 Jul 2020 18:59:28 -0700
Subject: [PATCH 1/3] zephyr: Use exponential backoffs in retry loops.

This reduces the number of retries that might spam APIs.  There is
some complexity here which is left unmanaged -- for instance,
maybe_restart_mirroring_script makes a number of restart attempts and
then fails, but will be retried every 15s by the surrounding
`process_loop`.  Previously, it would merely have looped forever
inside maybe_restart_mirroring_script.

Three loops are intentionally left as infinite `while True` loops
that merely cap their backoff at the default 90s.  Their callers do
not expect, or have any way to handle more gracefully, a failure of
the expected-infinite loop in `process_loop` or `zulip_to_zephyr`.
They maintain their previous behavior of retrying forever, albeit
more slowly.
---
 .../zephyr/zephyr_mirror_backend.py | 22 +++++++++++++++++------
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/zulip/integrations/zephyr/zephyr_mirror_backend.py b/zulip/integrations/zephyr/zephyr_mirror_backend.py
index 8b5197118..326c58876 100755
--- a/zulip/integrations/zephyr/zephyr_mirror_backend.py
+++ b/zulip/integrations/zephyr/zephyr_mirror_backend.py
@@ -17,6 +17,8 @@ import tempfile
 import select
 
+from zulip import RandomExponentialBackoff
+
 DEFAULT_SITE = "https://api.zulip.com"
 
 class States:
@@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None:
     except OSError:
         # We don't care whether we failed to cancel subs properly, but we should log it
         logger.exception("")
-    while True:
+    backoff = RandomExponentialBackoff(
+        maximum_retries=3,
+    )
+    while backoff.keep_going():
         try:
             os.execvp(os.path.abspath(__file__), sys.argv)
+            # No need for backoff.succeed, since this can't be reached
         except Exception:
             logger.exception("Error restarting mirroring script; trying again... Traceback:")
-            time.sleep(1)
+            backoff.fail()
+    raise Exception("Failed to reload too many times, aborting!")
 
 def process_loop(log: Optional[IO[Any]]) -> None:
     restart_check_count = 0
     last_check_time = time.time()
+    recieve_backoff = RandomExponentialBackoff()
     while True:
         select.select([zephyr._z.getFD()], [], [], 15)
         try:
+            process_backoff = RandomExponentialBackoff()
             # Fetch notices from the queue until its empty
             while True:
                 notice = zephyr.receive(block=False)
+                recieve_backoff.succeed()
                 if notice is None:
                     break
                 try:
                     process_notice(notice, log)
+                    process_backoff.succeed()
                 except Exception:
                     logger.exception("Error relaying zephyr:")
-                    time.sleep(2)
+                    process_backoff.fail()
         except Exception:
             logger.exception("Error checking for new zephyrs:")
-            time.sleep(1)
+            recieve_backoff.fail()
             continue
 
         if time.time() - last_check_time > 15:
@@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
 def zulip_to_zephyr(options: int) -> None:
     # Sync messages from zulip to zephyr
     logger.info("Starting syncing messages.")
+    backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
     while True:
         try:
             zulip_client.call_on_each_message(maybe_forward_to_zephyr)
         except Exception:
             logger.exception("Error syncing messages:")
-            time.sleep(1)
+            backoff.fail()
 
 def subscribed_to_mail_messages() -> bool:
     # In case we have lost our AFS tokens and those won't be able to

From 739b927cf53a39620207849cfbeb78c1ed3220c7 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Fri, 31 Jul 2020 19:00:35 -0700
Subject: [PATCH 2/3] zephyr: Mark the infinite loops as NoReturn.
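
Annotating these functions with `NoReturn` (from `typing`) records that
every path through them either loops forever or raises, so a type
checker such as mypy can treat any statement after a call to them as
unreachable (reported under --warn-unreachable); that is presumably why
the `sys.exit(0)` following `zulip_to_zephyr(options)` is dropped below.
A minimal, illustrative sketch of the idea -- the function names here
are made up and are not part of this patch:

    from typing import NoReturn

    def poll_once() -> None:
        ...  # stand-in for the real per-iteration work

    def serve_forever() -> NoReturn:
        # Never returns normally: every path loops or raises.
        while True:
            poll_once()

    serve_forever()
    print("done")  # mypy --warn-unreachable flags this line as unreachable
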
---
 zulip/integrations/zephyr/zephyr_mirror_backend.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/zulip/integrations/zephyr/zephyr_mirror_backend.py b/zulip/integrations/zephyr/zephyr_mirror_backend.py
index 326c58876..9d0b575a6 100755
--- a/zulip/integrations/zephyr/zephyr_mirror_backend.py
+++ b/zulip/integrations/zephyr/zephyr_mirror_backend.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from typing import Any, Dict, IO, List, Optional, Set, Text, Tuple, cast
+from typing import Any, Dict, IO, List, NoReturn, Optional, Set, Text, Tuple, cast
 from types import FrameType
 
 import sys
@@ -232,7 +232,7 @@ def maybe_restart_mirroring_script() -> None:
             backoff.fail()
     raise Exception("Failed to reload too many times, aborting!")
 
-def process_loop(log: Optional[IO[Any]]) -> None:
+def process_loop(log: Optional[IO[Any]]) -> NoReturn:
     restart_check_count = 0
     last_check_time = time.time()
     recieve_backoff = RandomExponentialBackoff()
@@ -767,7 +767,7 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
         # whole process
         logger.exception("Error forwarding message:")
 
-def zulip_to_zephyr(options: int) -> None:
+def zulip_to_zephyr(options: int) -> NoReturn:
     # Sync messages from zulip to zephyr
     logger.info("Starting syncing messages.")
     backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
@@ -1146,7 +1146,6 @@ def die_gracefully(signal: int, frame: FrameType) -> None:
         # Run the zulip => zephyr mirror in the child
         configure_logger(logger, "zulip=>zephyr")
         zulip_to_zephyr(options)
-        sys.exit(0)
     else:
         child_pid = None
     CURRENT_STATE = States.ZephyrToZulip

From 331d78c2cf40da83b6b5ce88d308eea6bfa07fd2 Mon Sep 17 00:00:00 2001
From: Alex Vandiver
Date: Mon, 3 Aug 2020 13:04:29 -0700
Subject: [PATCH 3/3] doc: Document the backoff class better.

---
 zulip/zulip/__init__.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/zulip/zulip/__init__.py b/zulip/zulip/__init__.py
index 94e4be07d..f61935be1 100644
--- a/zulip/zulip/__init__.py
+++ b/zulip/zulip/__init__.py
@@ -33,7 +33,27 @@ API_VERSTRING = "v1/"
 
 
 class CountingBackoff:
-    def __init__(self, maximum_retries: int = 10, timeout_success_equivalent: Optional[float] = None, delay_cap: float = 90.0) -> None:
+    def __init__(
+        self,
+        maximum_retries: int = 10,
+        timeout_success_equivalent: Optional[float] = None,
+        delay_cap: float = 90.0,
+    ) -> None:
+        """Sets up a retry-backoff object. Example usage:
+            backoff = zulip.CountingBackoff()
+            while backoff.keep_going():
+                try:
+                    something()
+                    backoff.succeed()
+                except Exception:
+                    backoff.fail()
+
+        timeout_success_equivalent is used in cases where 'success' is
+        never possible to determine automatically; it sets the
+        threshold in seconds before the next keep_going/fail, above
+        which the last run is treated like it was a success.
+
+        """
         self.number_of_retries = 0
         self.maximum_retries = maximum_retries
         self.timeout_success_equivalent = timeout_success_equivalent
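
For readers who have not used these backoff helpers before, the two
retry patterns adopted by the first patch can be sketched roughly as
follows.  This is illustrative only: `flaky_call` is a made-up stand-in
for the real work (os.execvp, zephyr.receive, and so on), and it
assumes the `zulip` package is importable:

    import zulip

    def flaky_call() -> None:
        ...  # stand-in for the operation that may fail

    def bounded_retries() -> None:
        # Pattern used by maybe_restart_mirroring_script: try a few
        # times, sleeping longer after each failure, then give up loudly.
        backoff = zulip.RandomExponentialBackoff(maximum_retries=3)
        while backoff.keep_going():
            try:
                flaky_call()
                backoff.succeed()
                return
            except Exception:
                backoff.fail()  # sleeps with a growing, capped delay
        raise Exception("Too many failures, giving up")

    def retry_forever() -> None:
        # Pattern used by process_loop and zulip_to_zephyr: never give
        # up, but back off (up to the default 90s cap) instead of
        # spinning when the call keeps failing.
        backoff = zulip.RandomExponentialBackoff(timeout_success_equivalent=120)
        while True:
            try:
                flaky_call()
            except Exception:
                backoff.fail()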