@@ -25,6 +25,7 @@ import (
2525 "slices"
2626 "strconv"
2727 "strings"
28+ "sync"
2829 "time"
2930
3031 "github.com/aenix-io/etcd-operator/internal/log"
@@ -47,6 +48,10 @@ import (
4748 clientv3 "go.etcd.io/etcd/client/v3"
4849)
4950
// etcdDefaultTimeout is the deadline applied to the context that is shared by
// the concurrent per-endpoint etcd status queries issued during reconciliation.
const etcdDefaultTimeout = 5 * time.Second
54+
5055// EtcdClusterReconciler reconciles a EtcdCluster object
5156type EtcdClusterReconciler struct {
5257 client.Client
@@ -56,6 +61,7 @@ type EtcdClusterReconciler struct {
5661// +kubebuilder:rbac:groups=etcd.aenix.io,resources=etcdclusters,verbs=get;list;watch;create;update;patch;delete
5762// +kubebuilder:rbac:groups=etcd.aenix.io,resources=etcdclusters/status,verbs=get;update;patch
5863// +kubebuilder:rbac:groups=etcd.aenix.io,resources=etcdclusters/finalizers,verbs=update
64+ // +kubebuilder:rbac:groups="",resources=endpoints,verbs=get;list;watch
5965// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;update;watch;delete;patch
6066// +kubebuilder:rbac:groups="",resources=services,verbs=get;create;delete;update;patch;list;watch
6167// +kubebuilder:rbac:groups="",resources=secrets,verbs=view;list;watch
@@ -80,13 +86,68 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
8086 return reconcile.Result {}, nil
8187 }
8288
89+ state := observables {}
90+
91+ // create two services and the pdb
92+ err = r .ensureUnconditionalObjects (ctx , instance )
93+ if err != nil {
94+ return ctrl.Result {}, err
95+ }
96+
97+ // fetch STS if exists
98+ err = r .Get (ctx , req .NamespacedName , & state .statefulSet )
99+ if client .IgnoreNotFound (err ) != nil {
100+ return ctrl.Result {}, fmt .Errorf ("couldn't get statefulset: %w" , err )
101+ }
102+ state .stsExists = state .statefulSet .UID != ""
103+
104+ // fetch endpoints
105+ clusterClient , singleClients , err := factory .NewEtcdClientSet (ctx , instance , r .Client )
106+ if err != nil {
107+ return ctrl.Result {}, err
108+ }
109+ state .endpointsFound = clusterClient != nil && singleClients != nil
110+
111+ if ! state .endpointsFound {
112+ if ! state .stsExists {
113+ // TODO: happy path for new cluster creation
114+ log .Debug (ctx , "happy path for new cluster creation (not yet implemented)" )
115+ }
116+ }
117+
118+ // get status of every endpoint and member list from every endpoint
119+ state .etcdStatuses = make ([]etcdStatus , len (singleClients ))
120+ {
121+ var wg sync.WaitGroup
122+ ctx , cancel := context .WithTimeout (ctx , etcdDefaultTimeout )
123+ for i := range singleClients {
124+ wg .Add (1 )
125+ go func (i int ) {
126+ defer wg .Done ()
127+ state .etcdStatuses [i ].fill (ctx , singleClients [i ])
128+ }(i )
129+ }
130+ wg .Wait ()
131+ cancel ()
132+ }
133+ state .setClusterID ()
134+ if state .inSplitbrain () {
135+ log .Error (ctx , fmt .Errorf ("etcd cluster in splitbrain" ), "etcd cluster in splitbrain, dropping from reconciliation queue" )
136+ factory .SetCondition (instance , factory .NewCondition (etcdaenixiov1alpha1 .EtcdConditionError ).
137+ WithStatus (true ).
138+ WithReason (string (etcdaenixiov1alpha1 .EtcdCondTypeSplitbrain )).
139+ WithMessage (string (etcdaenixiov1alpha1 .EtcdErrorCondSplitbrainMessage )).
140+ Complete (),
141+ )
142+ return r .updateStatus (ctx , instance )
143+ }
83144 // fill conditions
84145 if len (instance .Status .Conditions ) == 0 {
85146 factory .FillConditions (instance )
86147 }
87148
88149 // ensure managed resources
89- if err = r .ensureClusterObjects (ctx , instance ); err != nil {
150+ if err = r .ensureConditionalClusterObjects (ctx , instance ); err != nil {
90151 return r .updateStatusOnErr (ctx , instance , fmt .Errorf ("cannot create Cluster auxiliary objects: %w" , err ))
91152 }
92153
@@ -138,8 +199,8 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
138199 return r .updateStatus (ctx , instance )
139200}
140201
141- // ensureClusterObjects creates or updates all objects owned by cluster CR
142- func (r * EtcdClusterReconciler ) ensureClusterObjects (
202+ // ensureConditionalClusterObjects creates or updates all objects owned by cluster CR
203+ func (r * EtcdClusterReconciler ) ensureConditionalClusterObjects (
143204 ctx context.Context , cluster * etcdaenixiov1alpha1.EtcdCluster ) error {
144205
145206 if err := factory .CreateOrUpdateClusterStateConfigMap (ctx , cluster , r .Client ); err != nil {
@@ -148,30 +209,12 @@ func (r *EtcdClusterReconciler) ensureClusterObjects(
148209 }
149210 log .Debug (ctx , "cluster state configmap reconciled" )
150211
151- if err := factory .CreateOrUpdateHeadlessService (ctx , cluster , r .Client ); err != nil {
152- log .Error (ctx , err , "reconcile headless service failed" )
153- return err
154- }
155- log .Debug (ctx , "headless service reconciled" )
156-
157212 if err := factory .CreateOrUpdateStatefulSet (ctx , cluster , r .Client ); err != nil {
158213 log .Error (ctx , err , "reconcile statefulset failed" )
159214 return err
160215 }
161216 log .Debug (ctx , "statefulset reconciled" )
162217
163- if err := factory .CreateOrUpdateClientService (ctx , cluster , r .Client ); err != nil {
164- log .Error (ctx , err , "reconcile client service failed" )
165- return err
166- }
167- log .Debug (ctx , "client service reconciled" )
168-
169- if err := factory .CreateOrUpdatePdb (ctx , cluster , r .Client ); err != nil {
170- log .Error (ctx , err , "reconcile pdb failed" )
171- return err
172- }
173- log .Debug (ctx , "pdb reconciled" )
174-
175218 return nil
176219}
177220
@@ -498,3 +541,57 @@ func (r *EtcdClusterReconciler) disableAuth(ctx context.Context, authClient clie
498541
499542 return nil
500543}
544+
545+ // ensureUnconditionalObjects creates the two services and the PDB
546+ // which can be created at the start of the reconciliation loop
547+ // without any risk of disrupting the etcd cluster
548+ func (r * EtcdClusterReconciler ) ensureUnconditionalObjects (ctx context.Context , instance * etcdaenixiov1alpha1.EtcdCluster ) error {
549+ const concurrentOperations = 3
550+ c := make (chan error )
551+ defer close (c )
552+ ctx , cancel := context .WithCancel (ctx )
553+ defer cancel ()
554+ var wg sync.WaitGroup
555+ wg .Add (concurrentOperations )
556+ wrapWithMsg := func (err error , msg string ) error {
557+ if err != nil {
558+ return fmt .Errorf (msg + ": %w" , err )
559+ }
560+ return nil
561+ }
562+ go func (chan <- error ) {
563+ defer wg .Done ()
564+ select {
565+ case <- ctx .Done ():
566+ case c <- wrapWithMsg (factory .CreateOrUpdateClientService (ctx , instance , r .Client ),
567+ "couldn't ensure client service" ):
568+ }
569+ }(c )
570+ go func (chan <- error ) {
571+ defer wg .Done ()
572+ select {
573+ case <- ctx .Done ():
574+ case c <- wrapWithMsg (factory .CreateOrUpdateHeadlessService (ctx , instance , r .Client ),
575+ "couldn't ensure headless service" ):
576+ }
577+ }(c )
578+ go func (chan <- error ) {
579+ defer wg .Done ()
580+ select {
581+ case <- ctx .Done ():
582+ case c <- wrapWithMsg (factory .CreateOrUpdatePdb (ctx , instance , r .Client ),
583+ "couldn't ensure pod disruption budget" ):
584+ }
585+ }(c )
586+
587+ for i := 0 ; i < concurrentOperations ; i ++ {
588+ if err := <- c ; err != nil {
589+ cancel ()
590+
591+ // let all goroutines select the ctx.Done() case to avoid races on closed channels
592+ wg .Wait ()
593+ return err
594+ }
595+ }
596+ return nil
597+ }
0 commit comments