@@ -13,57 +13,171 @@ import (
1313 "github.com/siderolabs/talos/pkg/machinery/config/machine"
1414)
1515
// Names of the individual cluster checks.
//
// Each constant is both the description handed to the polling condition and
// the key accepted by getCheck, so a check can be referred to (for example in
// a skip list) by its name.
const (
	// Checks returned by PreBootSequenceChecks.

	CheckEtcdHealthy                  = "etcd to be healthy"
	CheckEtcdConsistent               = "etcd members to be consistent across nodes"
	CheckEtcdControlPlane             = "etcd members to be control plane nodes"
	CheckApidReady                    = "apid to be ready"
	CheckAllNodesMemorySizes          = "all nodes memory sizes"
	CheckAllNodesDiskSizes            = "all nodes disk sizes"
	CheckNoDiagnostics                = "no diagnostics"
	CheckKubeletHealthy               = "kubelet to be healthy"
	CheckAllNodesBootSequenceFinished = "all nodes to finish boot sequence"

	// Checks returned by K8sComponentsReadinessChecks.

	CheckK8sAllNodesReported           = "all k8s nodes to report"
	CheckControlPlaneStaticPodsRunning = "all control plane static pods to be running"
	CheckControlPlaneComponentsReady   = "all control plane components to be ready"

	// Checks appended by DefaultClusterChecks after the two groups above.

	CheckK8sAllNodesReady    = "all k8s nodes to report ready"
	CheckKubeProxyReady      = "kube-proxy to report ready"
	CheckCoreDNSReady        = "coredns to report ready"
	CheckK8sNodesSchedulable = "all k8s nodes to report schedulable"
)
43+
44+ func getCheck (name string ) ClusterCheck {
45+ switch name {
46+ // PreBootSequenceChecks
47+ case CheckEtcdHealthy :
48+ return func (cluster ClusterInfo ) conditions.Condition {
49+ return conditions .PollingCondition (CheckEtcdHealthy , func (ctx context.Context ) error {
50+ return ServiceHealthAssertion (ctx , cluster , "etcd" , WithNodeTypes (machine .TypeInit , machine .TypeControlPlane ))
51+ }, 5 * time .Minute , 5 * time .Second )
52+ }
53+ case CheckEtcdConsistent :
54+ return func (cluster ClusterInfo ) conditions.Condition {
55+ return conditions .PollingCondition (CheckEtcdConsistent , func (ctx context.Context ) error {
56+ return EtcdConsistentAssertion (ctx , cluster )
57+ }, 5 * time .Minute , 5 * time .Second )
58+ }
59+ case CheckEtcdControlPlane :
60+ return func (cluster ClusterInfo ) conditions.Condition {
61+ return conditions .PollingCondition (CheckEtcdControlPlane , func (ctx context.Context ) error {
62+ return EtcdControlPlaneNodesAssertion (ctx , cluster )
63+ }, 5 * time .Minute , 5 * time .Second )
64+ }
65+ case CheckApidReady :
66+ return func (cluster ClusterInfo ) conditions.Condition {
67+ return conditions .PollingCondition (CheckApidReady , func (ctx context.Context ) error {
68+ return ApidReadyAssertion (ctx , cluster )
69+ }, 5 * time .Minute , 5 * time .Second )
70+ }
71+ case CheckAllNodesMemorySizes :
72+ return func (cluster ClusterInfo ) conditions.Condition {
73+ return conditions .PollingCondition (CheckAllNodesMemorySizes , func (ctx context.Context ) error {
74+ return AllNodesMemorySizes (ctx , cluster )
75+ }, 5 * time .Minute , 5 * time .Second )
76+ }
77+ case CheckAllNodesDiskSizes :
78+ return func (cluster ClusterInfo ) conditions.Condition {
79+ return conditions .PollingCondition (CheckAllNodesDiskSizes , func (ctx context.Context ) error {
80+ return AllNodesDiskSizes (ctx , cluster )
81+ }, 5 * time .Minute , 5 * time .Second )
82+ }
83+ case CheckNoDiagnostics :
84+ return func (cluster ClusterInfo ) conditions.Condition {
85+ return conditions .PollingCondition (CheckNoDiagnostics , func (ctx context.Context ) error {
86+ return NoDiagnostics (ctx , cluster )
87+ }, time .Minute , 5 * time .Second )
88+ }
89+ case CheckKubeletHealthy :
90+ return func (cluster ClusterInfo ) conditions.Condition {
91+ return conditions .PollingCondition (CheckKubeletHealthy , func (ctx context.Context ) error {
92+ return ServiceHealthAssertion (ctx , cluster , "kubelet" , WithNodeTypes (machine .TypeInit , machine .TypeControlPlane ))
93+ }, 5 * time .Minute , 5 * time .Second )
94+ }
95+ case CheckAllNodesBootSequenceFinished :
96+ return func (cluster ClusterInfo ) conditions.Condition {
97+ return conditions .PollingCondition (CheckAllNodesBootSequenceFinished , func (ctx context.Context ) error {
98+ return AllNodesBootedAssertion (ctx , cluster )
99+ }, 5 * time .Minute , 5 * time .Second )
100+ }
101+
102+ // K8sComponentsReadinessChecks
103+ case CheckK8sAllNodesReported :
104+ return func (cluster ClusterInfo ) conditions.Condition {
105+ return conditions .PollingCondition (CheckK8sAllNodesReported , func (ctx context.Context ) error {
106+ return K8sAllNodesReportedAssertion (ctx , cluster )
107+ }, 5 * time .Minute , 30 * time .Second )
108+ }
109+ case CheckControlPlaneStaticPodsRunning :
110+ return func (cluster ClusterInfo ) conditions.Condition {
111+ return conditions .PollingCondition (CheckControlPlaneStaticPodsRunning , func (ctx context.Context ) error {
112+ return K8sControlPlaneStaticPods (ctx , cluster )
113+ }, 5 * time .Minute , 5 * time .Second )
114+ }
115+ case CheckControlPlaneComponentsReady :
116+ return func (cluster ClusterInfo ) conditions.Condition {
117+ return conditions .PollingCondition (CheckControlPlaneComponentsReady , func (ctx context.Context ) error {
118+ return K8sFullControlPlaneAssertion (ctx , cluster )
119+ }, 5 * time .Minute , 5 * time .Second )
120+ }
121+
122+ // Additional Checks for Default Cluster Checks
123+ case CheckK8sAllNodesReady :
124+ return func (cluster ClusterInfo ) conditions.Condition {
125+ return conditions .PollingCondition (CheckK8sAllNodesReady , func (ctx context.Context ) error {
126+ return K8sAllNodesReadyAssertion (ctx , cluster )
127+ }, 10 * time .Minute , 5 * time .Second )
128+ }
129+ case CheckKubeProxyReady :
130+ return func (cluster ClusterInfo ) conditions.Condition {
131+ return conditions .PollingCondition (CheckKubeProxyReady , func (ctx context.Context ) error {
132+ present , replicas , err := DaemonSetPresent (ctx , cluster , "kube-system" , "k8s-app=kube-proxy" )
133+ if err != nil {
134+ return err
135+ }
136+ if ! present {
137+ return conditions .ErrSkipAssertion
138+ }
139+ return K8sPodReadyAssertion (ctx , cluster , replicas , "kube-system" , "k8s-app=kube-proxy" )
140+ }, 5 * time .Minute , 5 * time .Second )
141+ }
142+ case CheckCoreDNSReady :
143+ return func (cluster ClusterInfo ) conditions.Condition {
144+ return conditions .PollingCondition (CheckCoreDNSReady , func (ctx context.Context ) error {
145+ present , replicas , err := DeploymentPresent (ctx , cluster , "kube-system" , "k8s-app=kube-dns" )
146+ if err != nil {
147+ return err
148+ }
149+ if ! present {
150+ return conditions .ErrSkipAssertion
151+ }
152+ return K8sPodReadyAssertion (ctx , cluster , replicas , "kube-system" , "k8s-app=kube-dns" )
153+ }, 5 * time .Minute , 5 * time .Second )
154+ }
155+ case CheckK8sNodesSchedulable :
156+ return func (cluster ClusterInfo ) conditions.Condition {
157+ return conditions .PollingCondition (CheckK8sNodesSchedulable , func (ctx context.Context ) error {
158+ return K8sAllNodesSchedulableAssertion (ctx , cluster )
159+ }, 5 * time .Minute , 5 * time .Second )
160+ }
161+ default :
162+ panic ("unknown check name: " + name )
163+ }
164+ }
165+
16166// DefaultClusterChecks returns a set of default Talos cluster readiness checks.
17167func DefaultClusterChecks () []ClusterCheck {
168+ // Concatenate pre-boot, Kubernetes component, and additional checks.
18169 return slices .Concat (
19170 PreBootSequenceChecks (),
20171 K8sComponentsReadinessChecks (),
21172 []ClusterCheck {
22173 // wait for all the nodes to report ready at k8s level
23- func (cluster ClusterInfo ) conditions.Condition {
24- return conditions .PollingCondition ("all k8s nodes to report ready" , func (ctx context.Context ) error {
25- return K8sAllNodesReadyAssertion (ctx , cluster )
26- }, 10 * time .Minute , 5 * time .Second )
27- },
28-
174+ getCheck (CheckK8sAllNodesReady ),
29175 // wait for kube-proxy to report ready
30- func (cluster ClusterInfo ) conditions.Condition {
31- return conditions .PollingCondition ("kube-proxy to report ready" , func (ctx context.Context ) error {
32- present , replicas , err := DaemonSetPresent (ctx , cluster , "kube-system" , "k8s-app=kube-proxy" )
33- if err != nil {
34- return err
35- }
36-
37- if ! present {
38- return conditions .ErrSkipAssertion
39- }
40-
41- return K8sPodReadyAssertion (ctx , cluster , replicas , "kube-system" , "k8s-app=kube-proxy" )
42- }, 5 * time .Minute , 5 * time .Second )
43- },
44-
176+ getCheck (CheckKubeProxyReady ),
45177 // wait for coredns to report ready
46- func (cluster ClusterInfo ) conditions.Condition {
47- return conditions .PollingCondition ("coredns to report ready" , func (ctx context.Context ) error {
48- present , replicas , err := DeploymentPresent (ctx , cluster , "kube-system" , "k8s-app=kube-dns" )
49- if err != nil {
50- return err
51- }
52-
53- if ! present {
54- return conditions .ErrSkipAssertion
55- }
56-
57- return K8sPodReadyAssertion (ctx , cluster , replicas , "kube-system" , "k8s-app=kube-dns" )
58- }, 5 * time .Minute , 5 * time .Second )
59- },
60-
178+ getCheck (CheckCoreDNSReady ),
61179 // wait for all the nodes to be schedulable
62- func (cluster ClusterInfo ) conditions.Condition {
63- return conditions .PollingCondition ("all k8s nodes to report schedulable" , func (ctx context.Context ) error {
64- return K8sAllNodesSchedulableAssertion (ctx , cluster )
65- }, 5 * time .Minute , 5 * time .Second )
66- },
180+ getCheck (CheckK8sNodesSchedulable ),
67181 },
68182 )
69183}
@@ -74,25 +188,11 @@ func DefaultClusterChecks() []ClusterCheck {
74188func K8sComponentsReadinessChecks () []ClusterCheck {
75189 return []ClusterCheck {
76190 // wait for all the nodes to report in at k8s level
77- func (cluster ClusterInfo ) conditions.Condition {
78- return conditions .PollingCondition ("all k8s nodes to report" , func (ctx context.Context ) error {
79- return K8sAllNodesReportedAssertion (ctx , cluster )
80- }, 5 * time .Minute , 30 * time .Second ) // give more time per each attempt, as this check is going to build and cache kubeconfig
81- },
82-
191+ getCheck (CheckK8sAllNodesReported ),
83192 // wait for k8s control plane static pods
84- func (cluster ClusterInfo ) conditions.Condition {
85- return conditions .PollingCondition ("all control plane static pods to be running" , func (ctx context.Context ) error {
86- return K8sControlPlaneStaticPods (ctx , cluster )
87- }, 5 * time .Minute , 5 * time .Second )
88- },
89-
193+ getCheck (CheckControlPlaneStaticPodsRunning ),
90194 // wait for HA k8s control plane
91- func (cluster ClusterInfo ) conditions.Condition {
92- return conditions .PollingCondition ("all control plane components to be ready" , func (ctx context.Context ) error {
93- return K8sFullControlPlaneAssertion (ctx , cluster )
94- }, 5 * time .Minute , 5 * time .Second )
95- },
195+ getCheck (CheckControlPlaneComponentsReady ),
96196 }
97197}
98198
@@ -103,70 +203,47 @@ func ExtraClusterChecks() []ClusterCheck {
103203 return []ClusterCheck {}
104204}
105205
206+ // preBootSequenceCheckNames returns the list of pre-boot check names.
207+ func preBootSequenceCheckNames () []string {
208+ return []string {
209+ CheckEtcdHealthy ,
210+ CheckEtcdConsistent ,
211+ CheckEtcdControlPlane ,
212+ CheckApidReady ,
213+ CheckAllNodesMemorySizes ,
214+ CheckAllNodesDiskSizes ,
215+ CheckNoDiagnostics ,
216+ CheckKubeletHealthy ,
217+ CheckAllNodesBootSequenceFinished ,
218+ }
219+ }
220+
106221// PreBootSequenceChecks returns a set of Talos cluster readiness checks which are run before boot sequence.
107222func PreBootSequenceChecks () []ClusterCheck {
108- return []ClusterCheck {
109- // wait for etcd to be healthy on all control plane nodes
110- func (cluster ClusterInfo ) conditions.Condition {
111- return conditions .PollingCondition ("etcd to be healthy" , func (ctx context.Context ) error {
112- return ServiceHealthAssertion (ctx , cluster , "etcd" , WithNodeTypes (machine .TypeInit , machine .TypeControlPlane ))
113- }, 5 * time .Minute , 5 * time .Second )
114- },
115-
116- // wait for etcd members to be consistent across nodes
117- func (cluster ClusterInfo ) conditions.Condition {
118- return conditions .PollingCondition ("etcd members to be consistent across nodes" , func (ctx context.Context ) error {
119- return EtcdConsistentAssertion (ctx , cluster )
120- }, 5 * time .Minute , 5 * time .Second )
121- },
122-
123- // wait for etcd members to be the control plane nodes
124- func (cluster ClusterInfo ) conditions.Condition {
125- return conditions .PollingCondition ("etcd members to be control plane nodes" , func (ctx context.Context ) error {
126- return EtcdControlPlaneNodesAssertion (ctx , cluster )
127- }, 5 * time .Minute , 5 * time .Second )
128- },
129-
130- // wait for apid to be ready on all the nodes
131- func (cluster ClusterInfo ) conditions.Condition {
132- return conditions .PollingCondition ("apid to be ready" , func (ctx context.Context ) error {
133- return ApidReadyAssertion (ctx , cluster )
134- }, 5 * time .Minute , 5 * time .Second )
135- },
136-
137- // wait for all nodes to report their memory size
138- func (cluster ClusterInfo ) conditions.Condition {
139- return conditions .PollingCondition ("all nodes memory sizes" , func (ctx context.Context ) error {
140- return AllNodesMemorySizes (ctx , cluster )
141- }, 5 * time .Minute , 5 * time .Second )
142- },
143-
144- // wait for all nodes to report their disk size
145- func (cluster ClusterInfo ) conditions.Condition {
146- return conditions .PollingCondition ("all nodes disk sizes" , func (ctx context.Context ) error {
147- return AllNodesDiskSizes (ctx , cluster )
148- }, 5 * time .Minute , 5 * time .Second )
149- },
150-
151- // check diagnostics
152- func (cluster ClusterInfo ) conditions.Condition {
153- return conditions .PollingCondition ("no diagnostics" , func (ctx context.Context ) error {
154- return NoDiagnostics (ctx , cluster )
155- }, time .Minute , 5 * time .Second )
156- },
223+ return PreBootSequenceChecksFiltered (nil )
224+ }
157225
158- // wait for kubelet to be healthy on all
159- func (cluster ClusterInfo ) conditions.Condition {
160- return conditions .PollingCondition ("kubelet to be healthy" , func (ctx context.Context ) error {
161- return ServiceHealthAssertion (ctx , cluster , "kubelet" , WithNodeTypes (machine .TypeInit , machine .TypeControlPlane ))
162- }, 5 * time .Minute , 5 * time .Second )
163- },
226+ // PreBootSequenceChecksFiltered returns a filtered version of the PreBootSequenceChecks,
227+ // removing any checks whose names appear in the provided 'skips' list.
228+ func PreBootSequenceChecksFiltered (skips []string ) []ClusterCheck {
229+ checkNames := []string {
230+ CheckEtcdHealthy ,
231+ CheckEtcdConsistent ,
232+ CheckEtcdControlPlane ,
233+ CheckApidReady ,
234+ CheckAllNodesMemorySizes ,
235+ CheckAllNodesDiskSizes ,
236+ CheckNoDiagnostics ,
237+ CheckKubeletHealthy ,
238+ CheckAllNodesBootSequenceFinished ,
239+ }
164240
165- // wait for all nodes to finish booting
166- func ( cluster ClusterInfo ) conditions. Condition {
167- return conditions . PollingCondition ( "all nodes to finish boot sequence" , func ( ctx context. Context ) error {
168- return AllNodesBootedAssertion ( ctx , cluster )
169- }, 5 * time . Minute , 5 * time . Second )
170- },
241+ var filtered [] ClusterCheck
242+ for _ , name := range checkNames {
243+ if slices . Contains ( skips , name ) {
244+ continue
245+ }
246+ filtered = append ( filtered , getCheck ( name ))
171247 }
248+ return filtered
172249}
0 commit comments