Avoid repeating tasks in multiple files. Make sure cluster health is OK before play continues from last node

ivareri · ivareri · commit 950036771f2d · 2025-05-04T17:58:05.000+02:00
diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
@@ -52,30 +52,6 @@
     node_found: "{{ response.json | json_query(node_query) | length > 0 }}"
     node_query: "[?name=='{{ elasticsearch_nodename }}']"
 
-- name: Enable shard allocation for the cluster
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
-    method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
-    body_format: json
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  # next line is boolean not string, so no quotes around true
-  # use python truthiness
-  until: "response.json.acknowledged == true"
-  retries: 5
-  delay: 30
-
-- name: Wait for cluster health to return to yellow or green
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-    method: GET
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  until: "response.json.status == 'yellow' or response.json.status == 'green'"
-  retries: 200
-  delay: 30
+# Re-enable shard allocation and wait for acceptable cluster health; don't continue the play until then
+- name: Cluster health check
+  ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
@@ -15,82 +15,10 @@
   ansible.builtin.set_fact:
     elasticsearch_http_protocol: "https"
 
-# Usually we should not need this step. It's only there to recover from broken upgrade plays
-# Without this step the cluster would never recover and the play would always fail
-- name: Enable shard allocation for the cluster
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
-    method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
-    body_format: json
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  # next line is boolean not string, so no quotes around true
-  # use python truthiness
-  until: "response.json.acknowledged == true"
-  retries: 5
-  delay: 30
-
-
-#
-# Start cluster health check
-#
-
-# this step is key!!!  Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards
-#
-# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
-## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
-##
-## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
-##
-## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
-
-- name: Check cluster health
-  block:
-  - name: Wait for cluster health to return to green
-    ansible.builtin.uri:
-      url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-      method: GET
-      user: elastic
-      password: "{{ elasticstack_password.stdout }}"
-      validate_certs: no
-    register: response
-    until: "response.json.status == 'green'"
-    retries: 50
-    delay: 30
-
-  # Timed out while waiting for green cluster
-  # Check if we can continue with a yellow cluster
-  rescue:
-    - name: "Rescue: Check if cluster health is yellow"
-      ansible.builtin.uri:
-        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-        method: GET
-        user: elastic
-        password: "{{ elasticstack_password.stdout }}"
-        validate_certs: no
-      register: response
-      failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
-
-    - name: "Rescure: Wait before verifying status"
-      ansible.builtin.pause:
-        seconds: 10
-
-    - name: "Rescue: Verify we can safely continue with yellow cluster"
-      ansible.builtin.uri:
-        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-        method: GET
-        user: elastic
-        password: "{{ elasticstack_password.stdout }}"
-        validate_certs: no
-      register: response
-      failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
-
-#
-# End cluster health check
-#
+# This step exists mainly to recover from a broken or restarted upgrade or rolling restart.
+# TODO: Only run this task for the first host.
+- name: Cluster health check
+  ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
 
 
 # Disabling shard allocation right after enabling it seems redundant. Please see above for details.
diff --git a/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml b/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml
@@ -0,0 +1,66 @@
+---
+
+# Writing null resets cluster.routing.allocation.enable to its default, which re-enables shard allocation.
+- name: Enable shard allocation for the cluster
+  ansible.builtin.uri:
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
+    method: PUT
+    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
+    body_format: json
+    user: elastic
+    password: "{{ elasticstack_password.stdout }}"
+    validate_certs: false
+  register: response
+  # A reply without a JSON body is treated as "not acknowledged yet", so the request is retried.
+  # acknowledged is a JSON boolean, hence the comparison with the bare word true (no quotes).
+  until: "response.json is defined and response.json.acknowledged == true"
+  retries: 5
+  delay: 30
+
+# Quoted from https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
+# During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
+#
+# If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
+#
+# In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
+
+- name: Check cluster health
+  block:
+    - name: Wait for cluster health to return to green
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: false
+      register: response
+      until: "response.json is defined and response.json.status == 'green'"
+      retries: 50
+      delay: 30
+
+  # Reached only when the wait above runs out of retries without seeing a green cluster.
+  # A yellow cluster with no relocating or initializing shards is accepted; it is checked twice, 10 seconds apart.
+  rescue:
+    - name: "Rescue: Check if cluster health is yellow"
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: false
+      register: response
+      failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
+
+    - name: "Rescue: Wait before verifying status"
+      ansible.builtin.pause:
+        seconds: 10
+
+    - name: "Rescue: Verify we can safely continue with yellow cluster"
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: false
+      register: response
+      failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"