@@ -132,6 +132,7 @@ import { EnvVarService, ResolvedEnvVars } from "../user/env-var-service";
132
132
import { RedlockAbortSignal } from "redlock" ;
133
133
import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server" ;
134
134
import { ConfigProvider } from "./config-provider" ;
135
+ import { isGrpcError } from "@gitpod/gitpod-protocol/lib/util/grpc" ;
135
136
136
137
export interface StartWorkspaceOptions extends GitpodServer . StartWorkspaceOptions {
137
138
excludeFeatureFlags ?: NamedWorkspaceFeatureFlag [ ] ;
@@ -559,6 +560,7 @@ export class WorkspaceStarter {
559
560
additionalAuth ,
560
561
forceRebuild ,
561
562
forceRebuild ,
563
+ abortSignal ,
562
564
region ,
563
565
) ;
564
566
@@ -579,23 +581,23 @@ export class WorkspaceStarter {
579
581
startRequest . setSpec ( spec ) ;
580
582
startRequest . setServicePrefix ( workspace . id ) ;
581
583
582
- if ( instance . status . phase === "pending" ) {
583
- // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
584
- const workspaceAlreadyExists = await this . existsWithWsManager ( ctx , instance ) ;
585
- if ( workspaceAlreadyExists ) {
586
- log . debug (
587
- { instanceId : instance . id , workspaceId : instance . workspaceId } ,
588
- "workspace already exists, not starting again" ,
589
- { phase : instance . status . phase } ,
590
- ) ;
591
- return ;
592
- }
593
- }
594
-
595
584
// choose a cluster and start the instance
596
585
let resp : StartWorkspaceResponse . AsObject | undefined = undefined ;
597
586
let retries = 0 ;
598
587
try {
588
+ if ( instance . status . phase === "pending" ) {
589
+ // due to the reconciliation loop we might have already started the workspace, especially in the "pending" phase
590
+ const workspaceAlreadyExists = await this . existsWithWsManager ( ctx , instance ) ;
591
+ if ( workspaceAlreadyExists ) {
592
+ log . debug (
593
+ { instanceId : instance . id , workspaceId : instance . workspaceId } ,
594
+ "workspace already exists, not starting again" ,
595
+ { phase : instance . status . phase } ,
596
+ ) ;
597
+ return ;
598
+ }
599
+ }
600
+
599
601
for ( ; retries < MAX_INSTANCE_START_RETRIES ; retries ++ ) {
600
602
if ( abortSignal . aborted ) {
601
603
return ;
@@ -659,6 +661,14 @@ export class WorkspaceStarter {
659
661
} ) ;
660
662
}
661
663
} catch ( err ) {
664
+ if ( isGrpcError ( err ) && ( err . code === grpc . status . UNAVAILABLE || err . code === grpc . status . ALREADY_EXISTS ) ) {
665
+ // fall-through: we don't want to fail but retry/wait for future updates to resolve this
666
+ } else if ( ! ( err instanceof StartInstanceError ) ) {
667
+ // fallback in case we did not already handle this error
668
+ await this . failInstanceStart ( { span } , err , workspace , instance , abortSignal ) ;
669
+ err = new StartInstanceError ( "other" , err ) ; // don't throw because there's nobody catching it. We just want to log/trace it.
670
+ }
671
+
662
672
this . logAndTraceStartWorkspaceError ( { span } , logCtx , err ) ;
663
673
} finally {
664
674
if ( abortSignal . aborted ) {
@@ -811,8 +821,9 @@ export class WorkspaceStarter {
811
821
// We may have never actually started the workspace which means that ws-manager-bridge never set a workspace status.
812
822
// We have to set that status ourselves.
813
823
instance . status . phase = "stopped" ;
814
- instance . stoppingTime = new Date ( ) . toISOString ( ) ;
815
- instance . stoppedTime = new Date ( ) . toISOString ( ) ;
824
+ const now = new Date ( ) . toISOString ( ) ;
825
+ instance . stoppingTime = now ;
826
+ instance . stoppedTime = now ;
816
827
817
828
instance . status . conditions . failed = err . toString ( ) ;
818
829
instance . status . message = `Workspace cannot be started: ${ err } ` ;
@@ -1201,6 +1212,7 @@ export class WorkspaceStarter {
1201
1212
additionalAuth : Map < string , string > ,
1202
1213
ignoreBaseImageresolvedAndRebuildBase : boolean = false ,
1203
1214
forceRebuild : boolean = false ,
1215
+ abortSignal : RedlockAbortSignal ,
1204
1216
region ?: WorkspaceRegion ,
1205
1217
) : Promise < WorkspaceInstance > {
1206
1218
const span = TraceContext . startSpan ( "buildWorkspaceImage" , ctx ) ;
@@ -1302,6 +1314,7 @@ export class WorkspaceStarter {
1302
1314
additionalAuth ,
1303
1315
true ,
1304
1316
forceRebuild ,
1317
+ abortSignal ,
1305
1318
region ,
1306
1319
) ;
1307
1320
} else {
@@ -1338,24 +1351,8 @@ export class WorkspaceStarter {
1338
1351
}
1339
1352
1340
1353
// This instance's image build "failed" as well, so mark it as such.
1341
- const now = new Date ( ) . toISOString ( ) ;
1342
- instance = await this . workspaceDb . trace ( { span } ) . updateInstancePartial ( instance . id , {
1343
- status : { ...instance . status , phase : "stopped" , conditions : { failed : message } , message } ,
1344
- stoppedTime : now ,
1345
- stoppingTime : now ,
1346
- } ) ;
1347
-
1348
- // Mark the PrebuildWorkspace as failed
1349
- await this . failPrebuildWorkspace ( { span } , err , workspace ) ;
1354
+ await this . failInstanceStart ( { span } , err , workspace , instance , abortSignal ) ;
1350
1355
1351
- // Publish updated workspace instance
1352
- await this . publisher . publishInstanceUpdate ( {
1353
- workspaceID : workspace . ownerId ,
1354
- instanceID : instance . id ,
1355
- ownerID : workspace . ownerId ,
1356
- } ) ;
1357
-
1358
- TraceContext . setError ( { span } , err ) ;
1359
1356
const looksLikeUserError = ( msg : string ) : boolean => {
1360
1357
return msg . startsWith ( "build failed:" ) || msg . includes ( "headless task failed:" ) ;
1361
1358
} ;
@@ -1365,6 +1362,8 @@ export class WorkspaceStarter {
1365
1362
`workspace image build failed: ${ message } ` ,
1366
1363
{ looksLikeUserError : true } ,
1367
1364
) ;
1365
+ err = new StartInstanceError ( "imageBuildFailedUser" , err ) ;
1366
+ // Don't report this as "failed" to our metrics as it would trigger an alert
1368
1367
} else {
1369
1368
log . error (
1370
1369
{ instanceId : instance . id , userId : user . id , workspaceId : workspace . id } ,
@@ -1963,6 +1962,9 @@ export class WorkspaceStarter {
1963
1962
await client . describeWorkspace ( ctx , req ) ;
1964
1963
return true ;
1965
1964
} catch ( err ) {
1965
+ if ( isClusterMaintenanceError ( err ) ) {
1966
+ throw err ;
1967
+ }
1966
1968
return false ;
1967
1969
}
1968
1970
}
0 commit comments