Skip to content

Commit c17ddd1

Browse files
committed
[telemetry] Detect service reachability issues
Summary: Introduces a counter for the number of "reachability" issues for a service, which can be used to detect that a service is down or unresponsive. By visualizing `rate(restate.invoker.deployment_unreachable_errors.total)`, coupled with alerts, an operator can know when a service is facing connectivity problems.
1 parent 3fbe382 commit c17ddd1

File tree

4 files changed

+81
-34
lines changed

4 files changed

+81
-34
lines changed

crates/invoker-impl/src/error.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ use tokio::task::JoinError;
3333
pub struct InvokerError {
3434
pub kind: InvokerErrorKind,
3535
// Deployment ID associated with the error, if any.
36-
#[allow(dead_code)]
3736
pub deployment_id: Option<DeploymentId>,
3837
}
3938

crates/invoker-impl/src/lib.rs

Lines changed: 61 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,64 +17,67 @@ mod quota;
1717
mod state_machine_manager;
1818
mod status_store;
1919

20-
use input_command::{InputCommand, InvokeCommand};
21-
use invocation_state_machine::InvocationStateMachine;
22-
use invocation_task::InvocationTask;
23-
use invocation_task::{InvocationTaskOutput, InvocationTaskOutputInner};
24-
use metric_definitions::{INVOKER_PENDING_TASKS, INVOKER_TASKS_IN_FLIGHT};
25-
use metrics::{counter, gauge};
26-
use restate_core::cancellation_watcher;
27-
use restate_errors::warn_it;
28-
use restate_invoker_api::{
29-
Effect, EffectKind, EntryEnricher, InvocationErrorReport, InvocationStatusReport,
30-
InvokeInputJournal,
31-
};
32-
use restate_queue::SegmentQueue;
33-
use restate_timer_queue::TimerQueue;
34-
use restate_types::config::{InvokerOptions, ServiceClientOptions};
35-
use restate_types::identifiers::PartitionLeaderEpoch;
36-
use restate_types::identifiers::{DeploymentId, InvocationId, PartitionKey, WithPartitionKey};
37-
use restate_types::journal::enriched::EnrichedRawEntry;
38-
use restate_types::journal::{Completion, EntryIndex};
39-
use restate_types::live::{Live, LiveLoad};
40-
use restate_types::retries::RetryPolicy;
41-
use restate_types::schema::deployment::DeploymentResolver;
42-
use status_store::InvocationStatusStore;
20+
use std::borrow::Cow;
4321
use std::collections::{HashMap, HashSet};
4422
use std::future::Future;
4523
use std::ops::RangeInclusive;
4624
use std::path::PathBuf;
4725
use std::pin::Pin;
4826
use std::time::SystemTime;
4927
use std::{cmp, panic};
28+
29+
use metrics::{counter, gauge};
5030
use tokio::sync::mpsc;
5131
use tokio::task::{AbortHandle, JoinSet};
5232
use tracing::{debug, trace};
5333
use tracing::{error, instrument};
5434

55-
use crate::error::{InvokerError, SdkInvocationErrorV2};
56-
use crate::metric_definitions::{
57-
INVOKER_ENQUEUE, INVOKER_INVOCATION_TASKS, TASK_OP_COMPLETED, TASK_OP_FAILED, TASK_OP_STARTED,
58-
TASK_OP_SUSPENDED,
59-
};
60-
use error::InvokerErrorKind;
61-
pub use input_command::ChannelStatusReader;
62-
pub use input_command::InvokerHandle;
35+
use restate_core::cancellation_watcher;
36+
use restate_errors::warn_it;
6337
use restate_invoker_api::invocation_reader::InvocationReader;
64-
use restate_service_client::{AssumeRoleCacheMode, ServiceClient};
38+
use restate_invoker_api::{
39+
Effect, EffectKind, EntryEnricher, InvocationErrorReport, InvocationStatusReport,
40+
InvokeInputJournal,
41+
};
42+
use restate_queue::SegmentQueue;
43+
use restate_service_client::{AssumeRoleCacheMode, HttpError, ServiceClient, ServiceClientError};
44+
use restate_timer_queue::TimerQueue;
45+
use restate_types::config::{InvokerOptions, ServiceClientOptions};
6546
use restate_types::deployment::PinnedDeployment;
47+
use restate_types::identifiers::PartitionLeaderEpoch;
48+
use restate_types::identifiers::{DeploymentId, InvocationId, PartitionKey, WithPartitionKey};
6649
use restate_types::invocation::{InvocationEpoch, InvocationTarget};
50+
use restate_types::journal::enriched::EnrichedRawEntry;
51+
use restate_types::journal::{Completion, EntryIndex};
6752
use restate_types::journal_v2;
6853
use restate_types::journal_v2::raw::{
6954
RawCommand, RawEntry, RawEntryHeader, RawEvent, RawNotification,
7055
};
7156
use restate_types::journal_v2::{
7257
CommandIndex, EntryMetadata, Event, NotificationId, TransientErrorEvent,
7358
};
59+
use restate_types::live::{Live, LiveLoad};
60+
use restate_types::retries::RetryPolicy;
61+
use restate_types::schema::deployment::DeploymentResolver;
7462
use restate_types::schema::invocation_target::InvocationTargetResolver;
7563
use restate_types::schema::service::ServiceMetadataResolver;
7664
use restate_types::service_protocol::ServiceProtocolVersion;
7765

66+
use crate::error::{InvokerError, SdkInvocationErrorV2};
67+
use crate::metric_definitions::{
68+
INVOKER_DEPLOYMENT_UNREACHABLE_ERRORS, INVOKER_ENQUEUE, INVOKER_INVOCATION_TASKS,
69+
TASK_OP_COMPLETED, TASK_OP_FAILED, TASK_OP_STARTED, TASK_OP_SUSPENDED,
70+
};
71+
use error::InvokerErrorKind;
72+
pub use input_command::ChannelStatusReader;
73+
pub use input_command::InvokerHandle;
74+
use input_command::{InputCommand, InvokeCommand};
75+
use invocation_state_machine::InvocationStateMachine;
76+
use invocation_task::InvocationTask;
77+
use invocation_task::{InvocationTaskOutput, InvocationTaskOutputInner};
78+
use metric_definitions::{INVOKER_PENDING_TASKS, INVOKER_TASKS_IN_FLIGHT};
79+
use status_store::InvocationStatusStore;
80+
7881
#[derive(Debug, Clone, PartialEq, Eq)]
7982
pub(crate) enum Notification {
8083
Completion(Completion),
@@ -1090,6 +1093,15 @@ where
10901093
.remove_invocation_with_epoch(partition, &invocation_id, invocation_epoch)
10911094
{
10921095
debug_assert_eq!(invocation_epoch, ism.invocation_epoch);
1096+
1097+
if self.is_service_down_error(&error.kind) {
1098+
let deployment_id = error
1099+
.deployment_id
1100+
.map(|id| Cow::Owned(id.to_string()))
1101+
.unwrap_or_else(|| Cow::Borrowed("unknown"));
1102+
counter!(INVOKER_DEPLOYMENT_UNREACHABLE_ERRORS, "service" => ism.invocation_target.service_name().to_string(), "deployment" => deployment_id).increment(1);
1103+
}
1104+
10931105
self.handle_error_event(options, partition, invocation_id, error.kind, ism)
10941106
.await;
10951107
} else {
@@ -1098,6 +1110,22 @@ where
10981110
}
10991111
}
11001112

1113+
fn is_service_down_error(&self, error: &InvokerErrorKind) -> bool {
1114+
if let InvokerErrorKind::Client(client_err) = error {
1115+
match client_err.as_ref() {
1116+
ServiceClientError::Http(_, HttpError::Connect(_)) => true,
1117+
ServiceClientError::Lambda(_, error) => {
1118+
// service down errors are those which might indicate that the service is down or
1119+
// unreachable.
1120+
error.is_service_down()
1121+
}
1122+
_ => false,
1123+
}
1124+
} else {
1125+
false
1126+
}
1127+
}
1128+
11011129
#[instrument(
11021130
level = "trace",
11031131
skip_all,
@@ -1436,7 +1464,7 @@ mod tests {
14361464
use restate_types::schema::invocation_target::InvocationTargetMetadata;
14371465
use restate_types::schema::service::{InvocationAttemptOptions, ServiceMetadata};
14381466

1439-
use crate::error::{InvokerErrorKind, SdkInvocationErrorV2};
1467+
use crate::error::SdkInvocationErrorV2;
14401468
use crate::quota::InvokerConcurrencyQuota;
14411469

14421470
// -- Mocks

crates/invoker-impl/src/metric_definitions.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ pub const INVOKER_AVAILABLE_SLOTS: &str = "restate.invoker.available_slots";
1919
pub const INVOKER_CONCURRENCY_LIMIT: &str = "restate.invoker.concurrency_limit";
2020
pub const INVOKER_TASK_DURATION: &str = "restate.invoker.task_duration.seconds";
2121
pub const INVOKER_TASKS_IN_FLIGHT: &str = "restate.invoker.inflight_tasks";
22+
pub const INVOKER_DEPLOYMENT_UNREACHABLE_ERRORS: &str =
23+
"restate.invoker.deployment_unreachable_errors.total";
2224

2325
pub const TASK_OP_STARTED: &str = "started";
2426
pub const TASK_OP_SUSPENDED: &str = "suspended";
@@ -67,4 +69,10 @@ pub(crate) fn describe_metrics() {
6769
Unit::Count,
6870
"Number of inflight invoker tasks"
6971
);
72+
73+
describe_counter!(
74+
INVOKER_DEPLOYMENT_UNREACHABLE_ERRORS,
75+
Unit::Count,
76+
"Number of deployment down errors"
77+
);
7078
}

crates/service-client/src/lambda.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,18 @@ pub enum LambdaError {
277277
}
278278

279279
impl LambdaError {
280+
/// Service down errors are those which might indicate that the service is down or
281+
/// unreachable.
282+
pub fn is_service_down(&self) -> bool {
283+
match self {
284+
LambdaError::SdkError(err) => matches!(
285+
err.as_ref(),
286+
SdkError::DispatchFailure(_) | SdkError::TimeoutError(_)
287+
),
288+
_ => false,
289+
}
290+
}
291+
280292
/// Retryable errors are those which can be caused by transient faults and where
281293
/// retrying can succeed.
282294
pub fn is_retryable(&self) -> bool {

0 commit comments

Comments
 (0)