@@ -1584,6 +1584,13 @@ spec:
If this field is unspecified or false, a new pod will be created to replace
the evicted one.
type: boolean
error_on_termination:
description: |-
ErrorOnTermination indicates that the ProwJob should be completed and given
the ErrorState status if the pod that is executing the job is terminated.
If this field is unspecified or false, a new pod will be created to replace
[Review comment from a repository Member] I'm still concerned about making this the default behavior. I don't know if Kubernetes has the bandwidth to dig through all of our jobs and identify which should have this enabled safely or not. I would generally expect a breaking behavior change to be opt-in
the terminated one.
type: boolean
extra_refs:
description: |-
ExtraRefs are auxiliary repositories that
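This field is opt-in per job: leaving it unset (or false) keeps the current behavior of recreating the pod. As a minimal sketch of a job config that opts in (the repo, job name, and image below are illustrative placeholders; only the error_on_termination field comes from this change):

presubmits:
  org/repo:                          # hypothetical repo
  - name: pull-example-unit-test     # hypothetical job name
    decorate: true
    error_on_termination: true       # complete the ProwJob with ErrorState instead of recreating the pod
    spec:
      containers:
      - image: golang:1.22           # hypothetical image
        command:
        - make
        - test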
5 changes: 5 additions & 0 deletions pkg/apis/prowjobs/v1/types.go
@@ -182,6 +182,11 @@ type ProwJobSpec struct {
// If this field is unspecified or false, a new pod will be created to replace
// the evicted one.
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
// ErrorOnTermination indicates that the ProwJob should be completed and given
// the ErrorState status if the pod that is executing the job is terminated.
// If this field is unspecified or false, a new pod will be created to replace
// the terminated one.
ErrorOnTermination bool `json:"error_on_termination,omitempty"`

// PodSpec provides the basis for running the test under
// a Kubernetes agent
22 changes: 22 additions & 0 deletions pkg/config/config.go
@@ -661,6 +661,13 @@ type Plank struct {
// Defaults to 3. A value of 0 means no retries.
MaxRevivals *int `json:"max_revivals,omitempty"`

// TerminationConditionReasons is a set of pod status condition reasons that the
// controller matches against to determine whether the pod's node is being terminated.
// If a node is being terminated, the controller will restart the prowjob, unless the
// ErrorOnTermination option is set on the prowjob or the MaxRevivals limit is reached.
// Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
TerminationConditionReasons []string `json:"termination_condition_reasons,omitempty"`

// DefaultDecorationConfigs holds the default decoration config for specific values.
//
// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2514,6 +2521,19 @@ func parseProwConfig(c *Config) error {
c.Plank.MaxRevivals = &maxRetries
}

if c.Plank.TerminationConditionReasons == nil {
c.Plank.TerminationConditionReasons = []string{
// If the node no longer exists and the pod gets garbage collected,
// this condition will be set:
// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
"DeletionByPodGC",
// On GCP, before a new spot instance is started, the old pods are garbage
// collected (if they have not already been collected by the Kubernetes PodGC):
// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
"DeletionByGCPControllerManager",
}
}

if err := c.Gerrit.DefaultAndValidate(); err != nil {
return fmt.Errorf("validating gerrit config: %w", err)
}
@@ -2913,6 +2933,8 @@ func validateAgent(v JobBase, podNamespace string) error {
return fmt.Errorf("decoration requires agent: %s (found %q)", k, agent)
case v.ErrorOnEviction && agent != k:
return fmt.Errorf("error_on_eviction only applies to agent: %s (found %q)", k, agent)
case v.ErrorOnTermination && agent != k:
return fmt.Errorf("error_on_termination only applies to agent: %s (found %q)", k, agent)
case v.Namespace == nil || *v.Namespace == "":
return fmt.Errorf("failed to default namespace")
case *v.Namespace != podNamespace && agent != p:
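To make the new knob concrete, here is a hedged sketch of a Plank config that keeps the two default reasons and appends one more (TerminationByKubelet is shown purely as an illustrative extra value; only termination_condition_reasons and its two defaults come from this change). Because the default is only applied when the field is unset, listing reasons explicitly replaces the default set rather than extending it:

plank:
  max_revivals: 3
  termination_condition_reasons:
  - DeletionByPodGC
  - DeletionByGCPControllerManager
  - TerminationByKubelet    # illustrative extra reason to also treat as node termination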
19 changes: 19 additions & 0 deletions pkg/config/config_test.go
@@ -1428,6 +1428,13 @@ func TestValidateAgent(t *testing.T) {
},
pass: true,
},
{
name: "error_on_termination allowed for kubernetes agent",
base: func(j *JobBase) {
j.ErrorOnTermination = true
},
pass: true,
},
}

for _, tc := range cases {
@@ -8430,6 +8437,9 @@ plank:
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
termination_condition_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_namespace: default
prowjob_namespace: default
push_gateway:
@@ -8515,6 +8525,9 @@ plank:
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
termination_condition_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_namespace: default
prowjob_namespace: default
push_gateway:
@@ -8593,6 +8606,9 @@ plank:
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
termination_condition_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_namespace: default
prowjob_namespace: default
push_gateway:
@@ -8676,6 +8692,9 @@ plank:
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
termination_condition_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_namespace: default
prowjob_namespace: default
push_gateway:
5 changes: 5 additions & 0 deletions pkg/config/jobs.go
@@ -109,6 +109,11 @@ type JobBase struct {
// If this field is unspecified or false, a new pod will be created to replace
// the evicted one.
ErrorOnEviction bool `json:"error_on_eviction,omitempty"`
// ErrorOnTermination indicates that the ProwJob should be completed and given
// the ErrorState status if the pod that is executing the job is terminated.
// If this field is unspecified or false, a new pod will be created to replace
// the terminated one.
ErrorOnTermination bool `json:"error_on_termination,omitempty"`
// SourcePath contains the path where this job is defined
SourcePath string `json:"-"`
// Spec is the Kubernetes pod spec used if Agent is kubernetes.
7 changes: 7 additions & 0 deletions pkg/config/prow-config-documented.yaml
@@ -1321,6 +1321,13 @@ plank:
# Use `org/repo`, `org` or `*` as a key.
report_templates:
"": ""
# TerminationConditionReasons is a set of pod status condition reasons that the
# controller matches against to determine whether the pod's node is being terminated.
# If a node is being terminated, the controller will restart the prowjob, unless the
# ErrorOnTermination option is set on the prowjob or the MaxRevivals limit is reached.
# Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
termination_condition_reasons:
- ""
# PodNamespace is the namespace in the cluster that prow
# components will use for looking up Pods owned by ProwJobs.
# The namespace needs to exist and will not be created by prow.
13 changes: 7 additions & 6 deletions pkg/pjutil/pjutil.go
@@ -228,12 +228,13 @@ func specFromJobBase(jb config.JobBase) prowapi.ProwJobSpec {
namespace = *jb.Namespace
}
return prowapi.ProwJobSpec{
Job: jb.Name,
Agent: prowapi.ProwJobAgent(jb.Agent),
Cluster: jb.Cluster,
Namespace: namespace,
MaxConcurrency: jb.MaxConcurrency,
ErrorOnEviction: jb.ErrorOnEviction,
Job: jb.Name,
Agent: prowapi.ProwJobAgent(jb.Agent),
Cluster: jb.Cluster,
Namespace: namespace,
MaxConcurrency: jb.MaxConcurrency,
ErrorOnEviction: jb.ErrorOnEviction,
ErrorOnTermination: jb.ErrorOnTermination,

ExtraRefs: DecorateExtraRefs(jb.ExtraRefs, jb),
DecorationConfig: jb.DecorationConfig,
87 changes: 76 additions & 11 deletions pkg/plank/controller_test.go
@@ -65,7 +65,10 @@ const (
podDeletionPreventionFinalizer = "keep-from-vanishing"
)

var maxRevivals = 3
var (
maxRevivals = 3
terminationConditionReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
)

func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
presubmits := []config.Presubmit{
@@ -104,11 +107,12 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
MaxConcurrency: maxConcurrency,
MaxGoroutines: 20,
},
JobQueueCapacities: queueCapacities,
PodPendingTimeout: &metav1.Duration{Duration: podPendingTimeout},
PodRunningTimeout: &metav1.Duration{Duration: podRunningTimeout},
PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
MaxRevivals: &maxRevivals,
JobQueueCapacities: queueCapacities,
PodPendingTimeout: &metav1.Duration{Duration: podPendingTimeout},
PodRunningTimeout: &metav1.Duration{Duration: podRunningTimeout},
PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
MaxRevivals: &maxRevivals,
TerminationConditionReasons: terminationConditionReasons,
},
},
JobConfig: config.JobConfig{
@@ -1217,10 +1221,7 @@ func TestSyncPendingJob(t *testing.T) {
ExpectedURL: "boop-42/error",
},
{
// TODO: this test case tests the current behavior, but the behavior
// is non-ideal: the pod execution did not fail, instead the node on which
// the pod was running terminated
Name: "a terminated pod is handled as-if it failed",
Name: "delete terminated pod",
PJ: prowapi.ProwJob{
ObjectMeta: metav1.ObjectMeta{
Name: "boop-42",
@@ -1246,8 +1247,72 @@
},
},
},
ExpectedComplete: false,
ExpectedState: prowapi.PendingState,
ExpectedNumPods: 0,
},
{
Name: "delete terminated pod and remove its k8sreporter finalizer",
PJ: prowapi.ProwJob{
ObjectMeta: metav1.ObjectMeta{
Name: "boop-42",
Namespace: "prowjobs",
},
Spec: prowapi.ProwJobSpec{
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
},
Status: prowapi.ProwJobStatus{
State: prowapi.PendingState,
PodName: "boop-42",
},
},
Pods: []v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "boop-42",
Namespace: "pods",
Finalizers: []string{"prow.x-k8s.io/gcsk8sreporter"},
},
Status: v1.PodStatus{
Phase: v1.PodFailed,
Reason: Terminated,
},
},
},
ExpectedComplete: false,
ExpectedState: prowapi.PendingState,
ExpectedNumPods: 0,
},
{
Name: "don't delete terminated pod w/ error_on_termination, complete PJ instead",
PJ: prowapi.ProwJob{
ObjectMeta: metav1.ObjectMeta{
Name: "boop-42",
Namespace: "prowjobs",
},
Spec: prowapi.ProwJobSpec{
ErrorOnTermination: true,
PodSpec: &v1.PodSpec{Containers: []v1.Container{{Name: "test-name", Env: []v1.EnvVar{}}}},
},
Status: prowapi.ProwJobStatus{
State: prowapi.PendingState,
PodName: "boop-42",
},
},
Pods: []v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
Name: "boop-42",
Namespace: "pods",
},
Status: v1.PodStatus{
Phase: v1.PodFailed,
Reason: Terminated,
},
},
},
ExpectedComplete: true,
ExpectedState: prowapi.FailureState,
ExpectedState: prowapi.ErrorState,
ExpectedNumPods: 1,
ExpectedURL: "boop-42/error",
},
25 changes: 23 additions & 2 deletions pkg/plank/reconciler.go
@@ -21,6 +21,7 @@ import (
"encoding/json"
"errors"
"fmt"
"slices"
"strings"
"sync"
"time"
@@ -469,14 +470,20 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
pj.Status.PodName = pn
r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
}
} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
} else if podUnexpectedStopCause := getPodUnexpectedStopCause(pod, r.config().Plank.TerminationConditionReasons); podUnexpectedStopCause != PodUnexpectedStopCauseNone {
switch {
case podUnexpectedStopCause == PodUnexpectedStopCauseEvicted && pj.Spec.ErrorOnEviction:
// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
r.log.WithField("error-on-eviction", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got evicted, fail job.")
pj.SetComplete()
pj.Status.State = prowv1.ErrorState
pj.Status.Description = "Job pod was evicted by the cluster."
case podUnexpectedStopCause == PodUnexpectedStopCauseTerminated && pj.Spec.ErrorOnTermination:
// ErrorOnTermination is enabled, complete the PJ and mark it as errored.
r.log.WithField("error-on-termination", true).WithFields(pjutil.ProwJobFields(pj)).Info("Pods Node got terminated, fail job.")
pj.SetComplete()
pj.Status.State = prowv1.ErrorState
pj.Status.Description = "Job pod's node was terminated."
case pj.Status.PodRevivalCount >= *r.config().Plank.MaxRevivals:
// MaxRevivals is reached, complete the PJ and mark it as errored.
r.log.WithField("unexpected-stop-cause", podUnexpectedStopCause).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
@@ -661,14 +668,28 @@ const (
PodUnexpectedStopCauseNone PodUnexpectedStopCause = ""
PodUnexpectedStopCauseUnknown PodUnexpectedStopCause = "unknown"
PodUnexpectedStopCauseEvicted PodUnexpectedStopCause = "evicted"
PodUnexpectedStopCauseTerminated PodUnexpectedStopCause = "terminated"
PodUnexpectedStopCauseUnreachable PodUnexpectedStopCause = "unreachable"
)

func getPodUnexpectedStopCause(pod *corev1.Pod) PodUnexpectedStopCause {
func getPodUnexpectedStopCause(pod *corev1.Pod, terminationConditionReasons []string) PodUnexpectedStopCause {
if pod.Status.Reason == Evicted {
return PodUnexpectedStopCauseEvicted
}

// If there was a Graceful node shutdown, the Pod's status will have a
// reason set to "Terminated":
// https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
if pod.Status.Reason == Terminated {
return PodUnexpectedStopCauseTerminated
}

for _, condition := range pod.Status.Conditions {
if slices.Contains(terminationConditionReasons, condition.Reason) {
return PodUnexpectedStopCauseTerminated
}
}

if pod.Status.Reason == NodeUnreachablePodReason && pod.DeletionTimestamp != nil {
return PodUnexpectedStopCauseUnreachable
}
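For reference, the condition-based branch above matches on the reason of any pod status condition, regardless of its type. With the default reasons, a pod cleaned up by the Kubernetes PodGC would carry a status along the lines of the sketch below (values are illustrative, based on the pod disruption conditions documentation linked earlier, not output captured from this controller):

status:
  phase: Failed
  conditions:
  - type: DisruptionTarget
    status: "True"
    reason: DeletionByPodGC                    # matches the default termination_condition_reasons
    message: "PodGC: node no longer exists"    # illustrative message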