Skip to content

Commit d4be614

Browse files
committed
[telemetry] Detect service reachability issues
Summary: Introduces a counter for the number of "reachability" issues per service, which can be used to detect that a service is down or unresponsive. By visualizing `rate(restate.invoker.service_unreachable_errors.total)` and coupling it with alerts, an operator can know when a service is facing connectivity problems.
1 parent 3bfdcc9 commit d4be614

File tree

3 files changed

+44
-3
lines changed

3 files changed

+44
-3
lines changed

crates/invoker-impl/src/lib.rs

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@ use tracing::{error, instrument};
5454

5555
use crate::error::SdkInvocationErrorV2;
5656
use crate::metric_definitions::{
57-
INVOKER_ENQUEUE, INVOKER_INVOCATION_TASKS, TASK_OP_COMPLETED, TASK_OP_FAILED, TASK_OP_STARTED,
58-
TASK_OP_SUSPENDED,
57+
INVOKER_ENQUEUE, INVOKER_INVOCATION_TASKS, INVOKER_SERVICE_UNREACHABLE_ERRORS,
58+
TASK_OP_COMPLETED, TASK_OP_FAILED, TASK_OP_STARTED, TASK_OP_SUSPENDED,
5959
};
6060
use error::InvokerError;
6161
pub use input_command::ChannelStatusReader;
6262
pub use input_command::InvokerHandle;
6363
use restate_invoker_api::invocation_reader::InvocationReader;
64-
use restate_service_client::{AssumeRoleCacheMode, ServiceClient};
64+
use restate_service_client::{AssumeRoleCacheMode, HttpError, ServiceClient, ServiceClientError};
6565
use restate_types::deployment::PinnedDeployment;
6666
use restate_types::invocation::{InvocationEpoch, InvocationTarget};
6767
use restate_types::journal_v2;
@@ -1090,6 +1090,11 @@ where
10901090
.remove_invocation_with_epoch(partition, &invocation_id, invocation_epoch)
10911091
{
10921092
debug_assert_eq!(invocation_epoch, ism.invocation_epoch);
1093+
1094+
if self.is_service_down_error(&error) {
1095+
counter!(INVOKER_SERVICE_UNREACHABLE_ERRORS, "service" => ism.invocation_target.service_name().to_string()).increment(1);
1096+
}
1097+
10931098
self.handle_error_event(options, partition, invocation_id, error, ism)
10941099
.await;
10951100
} else {
@@ -1098,6 +1103,22 @@ where
10981103
}
10991104
}
11001105

1106+
fn is_service_down_error(&self, error: &InvokerError) -> bool {
1107+
if let InvokerError::Client(client_err) = error {
1108+
match client_err.as_ref() {
1109+
ServiceClientError::Http(_, HttpError::Connect(_)) => true,
1110+
ServiceClientError::Lambda(_, error) => {
1111+
// service down errors are those which might indicate that the service is down or
1112+
// unreachable.
1113+
error.is_service_down()
1114+
}
1115+
_ => false,
1116+
}
1117+
} else {
1118+
false
1119+
}
1120+
}
1121+
11011122
#[instrument(
11021123
level = "trace",
11031124
skip_all,

crates/invoker-impl/src/metric_definitions.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ pub const INVOKER_TASK_DURATION: &str = "restate.invoker.task_duration.seconds";
2121
pub const INVOKER_SERVICE_RESPONSE_TIME: &str = "restate.invoker.service_response_time.seconds";
2222
pub const INVOKER_TASKS_IN_FLIGHT: &str = "restate.invoker.inflight_tasks";
2323
pub const INVOKER_JOURNAL_REPLAY_TIME: &str = "restate.invoker.journal_replay_time.seconds";
24+
pub const INVOKER_SERVICE_UNREACHABLE_ERRORS: &str =
25+
"restate.invoker.service_unreachable_errors.total";
2426

2527
pub const TASK_OP_STARTED: &str = "started";
2628
pub const TASK_OP_SUSPENDED: &str = "suspended";
@@ -81,4 +83,10 @@ pub(crate) fn describe_metrics() {
8183
Unit::Count,
8284
"Number of inflight invoker tasks"
8385
);
86+
87+
describe_counter!(
88+
INVOKER_SERVICE_UNREACHABLE_ERRORS,
89+
Unit::Count,
90+
"Number of service down errors"
91+
);
8492
}

crates/service-client/src/lambda.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,18 @@ pub enum LambdaError {
277277
}
278278

279279
impl LambdaError {
280+
/// Service down errors are those which might indicate that the service is down or
281+
/// unreachable.
282+
pub fn is_service_down(&self) -> bool {
283+
match self {
284+
LambdaError::SdkError(err) => matches!(
285+
err.as_ref(),
286+
SdkError::DispatchFailure(_) | SdkError::TimeoutError(_)
287+
),
288+
_ => false,
289+
}
290+
}
291+
280292
/// Retryable errors are those which can be caused by transient faults and where
281293
/// retrying can succeed.
282294
pub fn is_retryable(&self) -> bool {

0 commit comments

Comments
 (0)