Skip to content

Commit b23ff53

Browse files
authored
Retry requests on failure to hopefully avoid the sporadic failures we've been seeing (#56)
2 parents b54ee70 + 4cf37af commit b23ff53

File tree

3 files changed

+117
-12
lines changed

3 files changed

+117
-12
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ serde = { version = "1.0.219", features = ["derive"] }
3939
serde_json = "1.0.141"
4040
tera = { version = "1.20.0", default-features = false, features = [ "urlencode" ] }
4141
thiserror = { version = "2.0.12", default-features = false }
42-
tokio = { version = "1.47.0", features = ["macros", "rt-multi-thread"] }
42+
tokio = { version = "1.47.0", features = ["macros", "rt-multi-thread", "test-util"] }
4343
toml = { version = "0.9.2", default-features = false, features = ["preserve_order", "parse", "serde", "std"] }
4444
tower-http = { version = "0.6.6", features = ["cors", "fs", "catch-panic", "trace"] }
4545
tracing = "0.1.41"

src/checking.rs

Lines changed: 115 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use std::{
1111
LazyLock, Mutex,
1212
atomic::{AtomicBool, Ordering},
1313
},
14+
time::Duration,
1415
};
1516

1617
use axum::{
@@ -22,7 +23,7 @@ use futures::{Stream, TryStreamExt};
2223
use reqwest::{Client, Response, StatusCode};
2324
use sarlacc::Intern;
2425
use tokio::sync::RwLock;
25-
use tracing::{error, info};
26+
use tracing::{error, info, instrument};
2627

2728
use crate::{discord::Snowflake, webring::CheckLevel};
2829

@@ -32,7 +33,16 @@ use crate::{discord::Snowflake, webring::CheckLevel};
3233
const WEBRING_CHANNEL: Snowflake = Snowflake::new(1319140464812753009);
3334

3435
/// The time in milliseconds for which the server is considered online after a successful ping.
35-
static ONLINE_CHECK_TTL_MS: i64 = 1000;
36+
const ONLINE_CHECK_TTL_MS: i64 = 1000;
37+
38+
/// The timeout to retry requesting a site after failure
39+
const RETRY_TIMEOUT: Duration = Duration::from_secs(5);
40+
41+
/// How many times to attempt to retry a connection after failure
42+
const RETRY_COUNT: usize = 5;
43+
44+
/// How long requests will wait before failing due to timing out
45+
const REQUEST_TIMEOUT: Duration = Duration::from_secs(30);
3646

3747
/// The HTTP client used to make requests to the webring sites for validation.
3848
static CLIENT: LazyLock<Client> = LazyLock::new(|| {
@@ -43,6 +53,7 @@ static CLIENT: LazyLock<Client> = LazyLock::new(|| {
4353
env!("CARGO_PKG_VERSION"),
4454
env!("CARGO_PKG_REPOSITORY")
4555
))
56+
.timeout(REQUEST_TIMEOUT)
4657
.build()
4758
.expect("Creating the HTTP client should not fail")
4859
});
@@ -143,6 +154,7 @@ pub async fn check(
143154
///
144155
/// If the site fails any check, returns `Some(CheckFailure)`.
145156
/// If the site passes all checks, returns `None`.
157+
#[instrument(skip(base_address))]
146158
async fn check_impl(
147159
website: &Uri,
148160
check_level: CheckLevel,
@@ -152,14 +164,40 @@ async fn check_impl(
152164
return None;
153165
}
154166

155-
let response = match if check_level == CheckLevel::ForLinks {
156-
CLIENT.get(website.to_string()).send().await
157-
} else {
158-
CLIENT.head(website.to_string()).send().await
159-
} {
160-
Ok(response) => response,
167+
let mut response;
168+
169+
let mut retry_limit = RETRY_COUNT;
170+
171+
loop {
172+
response = if check_level == CheckLevel::ForLinks {
173+
CLIENT.get(website.to_string()).send().await
174+
} else {
175+
CLIENT.head(website.to_string()).send().await
176+
};
177+
178+
if retry_limit == 0 {
179+
break;
180+
}
181+
182+
match &response {
183+
Ok(_) => break,
184+
Err(err) => {
185+
info!(
186+
site = %website, %err, delay = ?RETRY_TIMEOUT, "Error requesting site; retrying after delay"
187+
);
188+
}
189+
}
190+
191+
retry_limit -= 1;
192+
193+
tokio::time::sleep(RETRY_TIMEOUT).await;
194+
}
195+
196+
let response = match response {
197+
Ok(v) => v,
161198
Err(err) => return Some(CheckFailure::Connection(err)),
162199
};
200+
163201
mark_server_as_online().await;
164202
let successful_response = match response.error_for_status() {
165203
Ok(r) => r,
@@ -505,13 +543,29 @@ async fn scan_for_links(
505543

506544
#[cfg(test)]
507545
mod tests {
508-
use axum::{Router, body::Bytes, http::Uri, response::Html, routing::get};
546+
use std::{
547+
sync::{
548+
Arc,
549+
atomic::{AtomicBool, AtomicU64, Ordering},
550+
},
551+
time::Duration,
552+
};
553+
554+
use axum::{
555+
Router,
556+
body::{Body, Bytes},
557+
http::Uri,
558+
response::{Html, Response},
559+
routing::get,
560+
};
509561
use futures::stream;
510562
use indoc::formatdoc;
511563
use pretty_assertions::assert_eq;
512564
use reqwest::StatusCode;
513565
use sarlacc::Intern;
514566

567+
use crate::checking::REQUEST_TIMEOUT;
568+
515569
use super::{
516570
CheckFailure, CheckLevel, LinkStatus, LinkStatuses, WEBRING_CHANNEL, check, scan_for_links,
517571
};
@@ -826,7 +880,7 @@ mod tests {
826880
assert_eq!(expected, links.to_message());
827881
}
828882

829-
#[tokio::test]
883+
#[tokio::test(start_paused = true)]
830884
async fn check_failure_types() {
831885
// Start a web server so we can do each kinds of checks
832886
let server_addr = ("127.0.0.1", 32750);
@@ -907,6 +961,57 @@ mod tests {
907961
}
908962
}
909963

964+
#[tokio::test(start_paused = true)]
965+
async fn test_retrying() {
966+
// Start a web server that fails only the first request
967+
let server_addr = ("127.0.0.1", 32752);
968+
969+
let ok_hits = Arc::new(AtomicU64::new(0));
970+
let err_hits = Arc::new(AtomicU64::new(0));
971+
let already_requested = Arc::new(AtomicBool::new(false));
972+
973+
let ok_hits_for_server = Arc::clone(&ok_hits);
974+
let err_hits_for_server = Arc::clone(&err_hits);
975+
tokio::spawn(async move {
976+
let listener = tokio::net::TcpListener::bind(&server_addr).await.unwrap();
977+
let router = Router::new().route(
978+
"/up",
979+
get(async move || {
980+
if already_requested.swap(true, Ordering::Relaxed) {
981+
ok_hits_for_server.fetch_add(1, Ordering::Relaxed);
982+
Response::builder()
983+
.status(200)
984+
.body(Body::from("Hi there!"))
985+
.unwrap()
986+
} else {
987+
err_hits_for_server.fetch_add(1, Ordering::Relaxed);
988+
// Trigger the request timeout
989+
tokio::time::sleep(Duration::from_secs(1) + REQUEST_TIMEOUT).await;
990+
Response::builder()
991+
.status(500)
992+
.body(Body::from("Retry plz!"))
993+
.unwrap()
994+
}
995+
}),
996+
);
997+
axum::serve(listener, router).await.unwrap();
998+
});
999+
1000+
let base = Intern::new(Uri::from_static("https://ring.purduehackers.com"));
1001+
1002+
let maybe_failure = super::check(
1003+
&Uri::from_static("http://127.0.0.1:32752/up"),
1004+
CheckLevel::JustOnline,
1005+
base,
1006+
)
1007+
.await
1008+
.unwrap();
1009+
1010+
assert!(maybe_failure.is_none(), "{maybe_failure:?}");
1011+
assert_eq!(err_hits.load(Ordering::Relaxed), 1);
1012+
assert_eq!(ok_hits.load(Ordering::Relaxed), 1);
1013+
}
1014+
9101015
#[tokio::test]
9111016
#[ignore = "Kian's site could go down"]
9121017
async fn kians_site() {

src/webring.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -734,7 +734,7 @@ mod tests {
734734
}
735735

736736
#[expect(clippy::too_many_lines)]
737-
#[tokio::test]
737+
#[tokio::test(start_paused = true)]
738738
async fn test_webring() {
739739
let config = make_config();
740740
let webring = Webring::new(&config);

0 commit comments

Comments
 (0)