@@ -3,10 +3,12 @@ package olm
 import (
 	"context"
 	"fmt"
+	"time"
 
 	hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
 	log "github.com/sirupsen/logrus"
 	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -22,6 +24,11 @@ import (
 const (
 	// Name of packageserver API service
 	PackageserverName = "v1.packages.operators.coreos.com"
+
+	// maxDisruptionDuration is the maximum duration for which we consider pod disruption
+	// "expected" (e.g., during a cluster upgrade). Beyond this time, we treat the
+	// unavailability as a real failure rather than waiting indefinitely for pods
+	// that will never recover.
+	maxDisruptionDuration = 5 * time.Minute
 )
 
 // apiServiceResourceErrorActionable returns true if OLM can do something about any one
@@ -168,6 +175,168 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
 	return utilerrors.NewAggregate(errs)
 }
 
+// isAPIServiceBackendDisrupted checks whether the APIService is unavailable due to expected
+// pod disruption (e.g., during a node reboot or cluster upgrade) rather than an actual failure.
+// According to the Progressing condition contract, operators should not report Progressing=True
+// merely because pods are adjusting to new nodes or rebooting during a cluster upgrade.
+func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVersion, apiServiceName string) bool {
+	// Find the deployments that back this APIService via the CSV's install strategy.
+	// For most APIServices, the deployment name matches the CSV name or is specified in the CSV.
+	strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+	if err != nil {
+		a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+		return false
+	}
+
+	strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+	if !ok {
+		a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+		return false
+	}
+
+	// Check each deployment's pods.
+	// Note: We check all deployments in the CSV rather than trying to identify
+	// the specific deployment backing this APIService. This is because:
+	// 1. Mapping APIService -> Service -> Deployment requires complex logic
+	// 2. During cluster upgrades, all deployments in the CSV are likely affected
+	// 3. The time limit and failure detection logic prevent false positives
+	for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+		deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+		if err != nil {
+			if apierrors.IsNotFound(err) {
+				continue
+			}
+			a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+			continue
+		}
+
+		// Check if the deployment is being updated or rolling out.
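+		// (In appsv1.DeploymentStatus, UnavailableReplicas counts pods that are not yet
+		// available, and UpdatedReplicas < Replicas means some pods still run the old
+		// pod template, i.e., a rollout is in progress.)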
+		if deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
+			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+
+			// Check pod status to confirm the disruption.
+			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
+			if err != nil {
+				a.logger.Debugf("Error parsing deployment selector: %v", err)
+				continue
+			}
+
+			pods, err := a.lister.CoreV1().PodLister().Pods(csv.Namespace).List(selector)
+			if err != nil {
+				a.logger.Debugf("Error listing pods: %v", err)
+				continue
+			}
+
+			// Check if any pod is in an expected disruption state.
+			for _, pod := range pods {
+				// A terminating pod (DeletionTimestamp set) is the signature of a node
+				// drain or rolling update - treat it as expected disruption. Check this
+				// before the age gate below, since a pod being drained is usually old.
+				if pod.DeletionTimestamp != nil {
+					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
+					return true
+				}
+
+				// Use pod age as a proxy for how long the disruption has lasted: a young
+				// pod is likely a replacement still coming up, while an old pod that is
+				// still not serving points to a real failure, not temporary disruption.
+				podAge := time.Since(pod.CreationTimestamp.Time)
+				if podAge > maxDisruptionDuration {
+					a.logger.Debugf("Pod %s is %v old (exceeds %v) - treating as real failure",
+						pod.Name, podAge, maxDisruptionDuration)
+					continue
+				}
+
+				// For pending pods, we need to distinguish between expected disruption
+				// (being scheduled/created during a node drain) and real failures (ImagePullBackOff, etc.).
+				if pod.Status.Phase == corev1.PodPending {
+					isExpectedDisruption := false
+					isRealFailure := false
+
+					// Check pod conditions for scheduling issues.
+					for _, condition := range pod.Status.Conditions {
+						if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
+							// An unschedulable pod points to a capacity or affinity problem,
+							// not a temporary disruption from a cluster upgrade.
+							if condition.Reason == "Unschedulable" {
+								isRealFailure = true
+								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								break
+							}
+						}
+					}
+
+					// Check container statuses for real failures.
+					for _, containerStatus := range pod.Status.ContainerStatuses {
+						if containerStatus.State.Waiting != nil {
+							reason := containerStatus.State.Waiting.Reason
+							switch reason {
+							case "ContainerCreating", "PodInitializing":
+								// Expected during normal pod startup.
+								isExpectedDisruption = true
+							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
+								// Real failures, not temporary disruptions.
+								isRealFailure = true
+								a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
+							}
+						}
+					}
+
+					// Also check init container statuses.
+					for _, containerStatus := range pod.Status.InitContainerStatuses {
+						if containerStatus.State.Waiting != nil {
+							reason := containerStatus.State.Waiting.Reason
+							switch reason {
+							case "ContainerCreating", "PodInitializing":
+								isExpectedDisruption = true
+							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
+								isRealFailure = true
+								a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
+							}
+						}
+					}
+
+					// A real failure always wins over an expected-disruption signal.
+					if isRealFailure {
+						continue
+					}
+
+					if isExpectedDisruption {
+						a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
+						return true
+					}
+
+					// If pending without any container status, the pod is most likely still
+					// being scheduled - normal during a node drain.
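+					// Note: this heuristic can also match pods stuck before scheduling;
+					// the age gate above bounds how long we tolerate that state.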
+					if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
+						a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
+						return true
+					}
+				}
+
+				// Check container statuses of non-pending pods whose containers are (re)starting.
+				for _, containerStatus := range pod.Status.ContainerStatuses {
+					if containerStatus.State.Waiting != nil {
+						reason := containerStatus.State.Waiting.Reason
+						switch reason {
+						case "ContainerCreating", "PodInitializing":
+							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
+							return true
+						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
+							// Real failures - don't treat as disruption.
+							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return false
+}
339+
 func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion) (bool, error) {
 	for _, desc := range csv.Spec.APIServiceDefinitions.Owned {
 		apiService, err := a.lister.APIRegistrationV1().APIServiceLister().Get(desc.GetName())
@@ -182,6 +351,15 @@ func (a *Operator) areAPIServicesAvailable(csv *v1alpha1.ClusterServiceVersion)
 
 		if !install.IsAPIServiceAvailable(apiService) {
 			a.logger.Debugf("APIService not available for %s", desc.GetName())
+
+			// Check whether this unavailability is due to expected pod disruption.
+			// If so, don't immediately mark the CSV failed or report Progressing=True.
+			if a.isAPIServiceBackendDisrupted(csv, desc.GetName()) {
+				a.logger.Infof("APIService %s unavailable due to pod disruption (e.g., node reboot), will retry", desc.GetName())
+				// Return a retryable error so the sync is requeued rather than
+				// treating the APIService as definitively unavailable.
+				return false, olmerrors.NewRetryableError(fmt.Errorf("APIService %s temporarily unavailable due to pod disruption", desc.GetName()))
+			}
+
 			return false, nil
 		}
 
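
Note: the hunk above calls `olmerrors.NewRetryableError`, which is not shown in this diff. Assuming it is added elsewhere in the PR, a minimal sketch of such a helper might look like the following; the `RetryableError` type and the `IsRetryable` function are illustrative assumptions, not the actual OLM API:

```go
package olmerrors

import "errors"

// RetryableError wraps an error that should trigger a requeue of the sync
// rather than a terminal failure state. Hypothetical sketch; the real
// implementation in the PR may differ.
type RetryableError struct {
	err error
}

func NewRetryableError(err error) RetryableError {
	return RetryableError{err: err}
}

func (e RetryableError) Error() string { return e.err.Error() }

func (e RetryableError) Unwrap() error { return e.err }

// IsRetryable reports whether any error in err's chain is a RetryableError.
func IsRetryable(err error) bool {
	var re RetryableError
	return errors.As(err, &re)
}
```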
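The waiting-reason classification is repeated three times in `isAPIServiceBackendDisrupted` (regular containers, init containers, and the non-pending path). A follow-up could factor it into a helper; a sketch under that assumption, with `disruptionVerdict` and `classifyWaitingReason` as hypothetical names:

```go
package olm

// disruptionVerdict classifies a container waiting reason.
type disruptionVerdict int

const (
	verdictUnknown            disruptionVerdict = iota
	verdictExpectedDisruption                   // transient startup states
	verdictRealFailure                          // states that will not resolve on their own
)

// classifyWaitingReason maps a ContainerStateWaiting reason to a verdict.
func classifyWaitingReason(reason string) disruptionVerdict {
	switch reason {
	case "ContainerCreating", "PodInitializing":
		return verdictExpectedDisruption
	case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff",
		"CreateContainerConfigError", "InvalidImageName":
		return verdictRealFailure
	default:
		return verdictUnknown
	}
}
```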