Skip to content

Commit e5d03e1

Browse files
authored
[WorkspaceStarter] Fail instance on all errors (#18745)
* [server] Tone down EntitlementService logs: error to warn * [server] Make sure to fail a WorkspaceInstance on any error * [WorspaceStarter] Don't fail on temporary gRPC errors
1 parent e916bf2 commit e5d03e1

File tree

5 files changed

+46
-42
lines changed

5 files changed

+46
-42
lines changed

components/gitpod-protocol/src/util/grpc.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,7 @@ export function createClientCallMetricsInterceptor(metrics: IClientCallMetrics):
105105
return new grpc.InterceptingCall(nextCall(options), requester);
106106
};
107107
}
108+
109+
export function isGrpcError(err: any): err is grpc.StatusObject {
110+
return err.code && err.details;
111+
}

components/server/src/billing/entitlement-service.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ export class EntitlementServiceImpl implements EntitlementService {
122122
throw new Error("Unsupported billing mode: " + (billingMode as any).mode); // safety net
123123
}
124124
} catch (err) {
125-
log.error({ userId: user.id }, "EntitlementService error: mayStartWorkspace", err);
125+
log.warn({ userId: user.id }, "EntitlementService error: mayStartWorkspace", err);
126126
return {}; // When there is an EntitlementService error, we never want to break workspace starts
127127
}
128128
}
@@ -139,7 +139,7 @@ export class EntitlementServiceImpl implements EntitlementService {
139139
return this.ubp.maySetTimeout(userId, organizationId);
140140
}
141141
} catch (err) {
142-
log.error({ userId }, "EntitlementService error: maySetTimeout", err);
142+
log.warn({ userId }, "EntitlementService error: maySetTimeout", err);
143143
return true;
144144
}
145145
}
@@ -154,7 +154,7 @@ export class EntitlementServiceImpl implements EntitlementService {
154154
return this.ubp.getDefaultWorkspaceTimeout(userId, organizationId);
155155
}
156156
} catch (err) {
157-
log.error({ userId }, "EntitlementService error: getDefaultWorkspaceTimeout", err);
157+
log.warn({ userId }, "EntitlementService error: getDefaultWorkspaceTimeout", err);
158158
return WORKSPACE_TIMEOUT_DEFAULT_LONG;
159159
}
160160
}
@@ -169,7 +169,7 @@ export class EntitlementServiceImpl implements EntitlementService {
169169
return this.ubp.getDefaultWorkspaceLifetime(userId, organizationId);
170170
}
171171
} catch (err) {
172-
log.error({ userId }, "EntitlementService error: getDefaultWorkspaceLifetime", err);
172+
log.warn({ userId }, "EntitlementService error: getDefaultWorkspaceLifetime", err);
173173
return WORKSPACE_LIFETIME_LONG;
174174
}
175175
}
@@ -188,7 +188,7 @@ export class EntitlementServiceImpl implements EntitlementService {
188188
return this.ubp.limitNetworkConnections(userId, organizationId);
189189
}
190190
} catch (err) {
191-
log.error({ userId }, "EntitlementService error: limitNetworkConnections", err);
191+
log.warn({ userId }, "EntitlementService error: limitNetworkConnections", err);
192192
return false;
193193
}
194194
}
@@ -208,7 +208,7 @@ export class EntitlementServiceImpl implements EntitlementService {
208208
return billingMode.paid ? "paid" : "free";
209209
}
210210
} catch (err) {
211-
log.error({ userId }, "EntitlementService error: getBillingTier", err);
211+
log.warn({ userId }, "EntitlementService error: getBillingTier", err);
212212
return "paid";
213213
}
214214
}

components/server/src/prometheus-metrics.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ export type FailedInstanceStartReason =
200200
| "clusterSelectionFailed"
201201
| "startOnClusterFailed"
202202
| "imageBuildFailed"
203+
| "imageBuildFailedUser"
203204
| "resourceExhausted"
204205
| "workspaceClusterMaintenance"
205206
| "other";

components/server/src/workspace/workspace-service.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ import { goDurationToHumanReadable } from "@gitpod/gitpod-protocol/lib/util/time
6060
import { HeadlessLogEndpoint, HeadlessLogService } from "./headless-log-service";
6161
import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
6262
import { OrganizationService } from "../orgs/organization-service";
63+
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";
6364

6465
export interface StartWorkspaceOptions extends StarterStartWorkspaceOptions {
6566
/**
@@ -901,10 +902,6 @@ export class WorkspaceService {
901902

902903
// TODO(gpl) Make private after FGA rollout
903904
export function mapGrpcError(err: Error): Error {
904-
function isGrpcError(err: any): err is grpc.StatusObject {
905-
return err.code && err.details;
906-
}
907-
908905
if (!isGrpcError(err)) {
909906
return err;
910907
}

components/server/src/workspace/workspace-starter.ts

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ import { EnvVarService, ResolvedEnvVars } from "../user/env-var-service";
132132
import { RedlockAbortSignal } from "redlock";
133133
import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server";
134134
import { ConfigProvider } from "./config-provider";
135+
import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc";
135136

136137
export interface StartWorkspaceOptions extends GitpodServer.StartWorkspaceOptions {
137138
excludeFeatureFlags?: NamedWorkspaceFeatureFlag[];
@@ -559,6 +560,7 @@ export class WorkspaceStarter {
559560
additionalAuth,
560561
forceRebuild,
561562
forceRebuild,
563+
abortSignal,
562564
region,
563565
);
564566

@@ -579,23 +581,23 @@ export class WorkspaceStarter {
579581
startRequest.setSpec(spec);
580582
startRequest.setServicePrefix(workspace.id);
581583

582-
if (instance.status.phase === "pending") {
583-
// due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
584-
const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
585-
if (workspaceAlreadyExists) {
586-
log.debug(
587-
{ instanceId: instance.id, workspaceId: instance.workspaceId },
588-
"workspace already exists, not starting again",
589-
{ phase: instance.status.phase },
590-
);
591-
return;
592-
}
593-
}
594-
595584
// choose a cluster and start the instance
596585
let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
597586
let retries = 0;
598587
try {
588+
if (instance.status.phase === "pending") {
589+
// due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
590+
const workspaceAlreadyExists = await this.existsWithWsManager(ctx, instance);
591+
if (workspaceAlreadyExists) {
592+
log.debug(
593+
{ instanceId: instance.id, workspaceId: instance.workspaceId },
594+
"workspace already exists, not starting again",
595+
{ phase: instance.status.phase },
596+
);
597+
return;
598+
}
599+
}
600+
599601
for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
600602
if (abortSignal.aborted) {
601603
return;
@@ -659,6 +661,14 @@ export class WorkspaceStarter {
659661
});
660662
}
661663
} catch (err) {
664+
if (isGrpcError(err) && (err.code === grpc.status.UNAVAILABLE || err.code === grpc.status.ALREADY_EXISTS)) {
665+
// fall-through: we don't want to fail but retry/wait for future updates to resolve this
666+
} else if (!(err instanceof StartInstanceError)) {
667+
// fallback in case we did not already handle this error
668+
await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
669+
err = new StartInstanceError("other", err); // don't throw because there's nobody catching it. We just want to log/trace it.
670+
}
671+
662672
this.logAndTraceStartWorkspaceError({ span }, logCtx, err);
663673
} finally {
664674
if (abortSignal.aborted) {
@@ -811,8 +821,9 @@ export class WorkspaceStarter {
811821
// We may have never actually started the workspace which means that ws-manager-bridge never set a workspace status.
812822
// We have to set that status ourselves.
813823
instance.status.phase = "stopped";
814-
instance.stoppingTime = new Date().toISOString();
815-
instance.stoppedTime = new Date().toISOString();
824+
const now = new Date().toISOString();
825+
instance.stoppingTime = now;
826+
instance.stoppedTime = now;
816827

817828
instance.status.conditions.failed = err.toString();
818829
instance.status.message = `Workspace cannot be started: ${err}`;
@@ -1201,6 +1212,7 @@ export class WorkspaceStarter {
12011212
additionalAuth: Map<string, string>,
12021213
ignoreBaseImageresolvedAndRebuildBase: boolean = false,
12031214
forceRebuild: boolean = false,
1215+
abortSignal: RedlockAbortSignal,
12041216
region?: WorkspaceRegion,
12051217
): Promise<WorkspaceInstance> {
12061218
const span = TraceContext.startSpan("buildWorkspaceImage", ctx);
@@ -1302,6 +1314,7 @@ export class WorkspaceStarter {
13021314
additionalAuth,
13031315
true,
13041316
forceRebuild,
1317+
abortSignal,
13051318
region,
13061319
);
13071320
} else {
@@ -1338,24 +1351,8 @@ export class WorkspaceStarter {
13381351
}
13391352

13401353
// This instance's image build "failed" as well, so mark it as such.
1341-
const now = new Date().toISOString();
1342-
instance = await this.workspaceDb.trace({ span }).updateInstancePartial(instance.id, {
1343-
status: { ...instance.status, phase: "stopped", conditions: { failed: message }, message },
1344-
stoppedTime: now,
1345-
stoppingTime: now,
1346-
});
1347-
1348-
// Mark the PrebuildWorkspace as failed
1349-
await this.failPrebuildWorkspace({ span }, err, workspace);
1354+
await this.failInstanceStart({ span }, err, workspace, instance, abortSignal);
13501355

1351-
// Publish updated workspace instance
1352-
await this.publisher.publishInstanceUpdate({
1353-
workspaceID: workspace.ownerId,
1354-
instanceID: instance.id,
1355-
ownerID: workspace.ownerId,
1356-
});
1357-
1358-
TraceContext.setError({ span }, err);
13591356
const looksLikeUserError = (msg: string): boolean => {
13601357
return msg.startsWith("build failed:") || msg.includes("headless task failed:");
13611358
};
@@ -1365,6 +1362,8 @@ export class WorkspaceStarter {
13651362
`workspace image build failed: ${message}`,
13661363
{ looksLikeUserError: true },
13671364
);
1365+
err = new StartInstanceError("imageBuildFailedUser", err);
1366+
// Don't report this as "failed" to our metrics as it would trigger an alert
13681367
} else {
13691368
log.error(
13701369
{ instanceId: instance.id, userId: user.id, workspaceId: workspace.id },
@@ -1963,6 +1962,9 @@ export class WorkspaceStarter {
19631962
await client.describeWorkspace(ctx, req);
19641963
return true;
19651964
} catch (err) {
1965+
if (isClusterMaintenanceError(err)) {
1966+
throw err;
1967+
}
19661968
return false;
19671969
}
19681970
}

0 commit comments

Comments
 (0)