Merge pull request #222 from openzim/requests_timeout

benoit74 · web-flow · commit 974ac3703992 · 2024-11-25T09:27:11.000+01:00
Set the default timeout in `download.stream_file` to 10 seconds, and allow callers to override it
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Add `filesystem.validate_folder_writable` to check if a folder can be written to #200
 
+### Fixed
+
+- Set default timeout in `download.stream_file` to 10 seconds, and allow overriding the value #222
+
 ## [4.0.0] - 2024-08-05
 
 ### Added
diff --git a/src/zimscraperlib/constants.py b/src/zimscraperlib/constants.py
@@ -56,3 +56,7 @@
 ILLUSTRATIONS_METADATA_RE = re.compile(
     r"^Illustration_(?P<height>\d+)x(?P<width>\d+)@(?P<scale>\d+)$"
 )
+
+# default timeout (in seconds) to get a response from upstream when doing web requests;
+# this is not the total time allowed to download the whole resource
+DEFAULT_WEB_REQUESTS_TIMEOUT = 10
diff --git a/src/zimscraperlib/download.py b/src/zimscraperlib/download.py
@@ -15,6 +15,7 @@
 import yt_dlp as youtube_dl
 
 from zimscraperlib import logger
+from zimscraperlib.constants import DEFAULT_WEB_REQUESTS_TIMEOUT
 
 
 class YoutubeDownloader:
@@ -181,6 +182,7 @@ def stream_file(
     max_retries: int | None = 5,
     headers: dict[str, str] | None = None,
     session: requests.Session | None = None,
+    timeout: int | None = DEFAULT_WEB_REQUESTS_TIMEOUT,
     *,
     only_first_block: bool | None = False,
 ) -> tuple[int, requests.structures.CaseInsensitiveDict[str]]:
@@ -208,6 +210,7 @@ def stream_file(
         stream=True,
         proxies=proxies,
         headers=headers,
+        timeout=timeout,
     )
     resp.raise_for_status()
 
diff --git a/tests/download/test_download.py b/tests/download/test_download.py
@@ -14,6 +14,7 @@
 import requests.structures
 from yt_dlp import DownloadError
 
+from zimscraperlib.constants import DEFAULT_WEB_REQUESTS_TIMEOUT
 from zimscraperlib.download import (
     BestMp4,
     BestWebm,
@@ -22,13 +23,11 @@
     stream_file,
 )
 
-DEFAULT_REQUEST_TIMEOUT = 60
-
 
 def assert_downloaded_file(url, file):
     assert file.exists()
     # our google test urls dont support HEAD
-    req = requests.get(url, timeout=DEFAULT_REQUEST_TIMEOUT)
+    req = requests.get(url, timeout=DEFAULT_WEB_REQUESTS_TIMEOUT)
     # we test against binary response: Content-Length not accurate as gzip-encoded
     assert file.stat().st_size == len(req.content)
 
@@ -90,7 +89,11 @@ def test_first_block_download_custom_session(mocker, valid_http_url):
     )
     # check that custom session has been used
     custom_session.get.assert_called_once_with(
-        valid_http_url, stream=True, proxies=None, headers=None
+        valid_http_url,
+        stream=True,
+        proxies=None,
+        headers=None,
+        timeout=DEFAULT_WEB_REQUESTS_TIMEOUT,
     )
     requests.Session.assert_not_called()  # pyright: ignore
 
@@ -130,7 +133,7 @@ def test_stream_to_bytes(valid_https_url):
     assert_headers(ret)
     assert (
         byte_stream.read()
-        == requests.get(valid_https_url, timeout=DEFAULT_REQUEST_TIMEOUT).content
+        == requests.get(valid_https_url, timeout=DEFAULT_WEB_REQUESTS_TIMEOUT).content
     )
 
 

Original file line number	Diff line number	Diff line change
`@@ -56,3 +56,7 @@`
`56`	`56`	`ILLUSTRATIONS_METADATA_RE = re.compile(`
`57`	`57`	`r"^Illustration_(?P<height>\d+)x(?P<width>\d+)@(?P<scale>\d+)$"`
`58`	`58`	`)`
	`59`	`+`
	`60`	`+# default timeout (in seconds) to get a response from upstream when doing web requests;`
	`61`	`+# this is not the total time allowed to download the whole resource`
	`62`	`+DEFAULT_WEB_REQUESTS_TIMEOUT = 10`