Skip to content

Commit 1684231

Browse files
committed
Allow upgrade to continue with yellow cluster health under certain conditions
1 parent 750356c commit 1684231

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,14 @@
3333
retries: 5
3434
delay: 30
3535

36-
# this step is key!!! Don't restart more nodes
37-
# until all shards have completed recovery
36+
# this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards
37+
#
38+
# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
39+
## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
40+
##
41+
## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
42+
##
43+
## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
3844
- name: Wait for cluster health to return to green
3945
ansible.builtin.uri:
4046
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
@@ -43,10 +49,33 @@
4349
password: "{{ elasticstack_password.stdout }}"
4450
validate_certs: no
4551
register: response
46-
until: "response.json.status == 'green'"
52+
until: "response.json.status == 'green' or
53+
( response.json.status == 'yellow' and
54+
response.json.relocating_shards == 0 and
55+
response.json.initializing_shards == 0
56+
)"
4757
retries: 50
4858
delay: 30
4959

60+
# Extra safety in case we continue with a yellow cluster
61+
# Wait a short time, then check cluster status again
62+
# Re-check for the yellow-status path. The block runs only when the registered
# `response` reported a yellow cluster: it pauses for 10 seconds, queries
# _cluster/health one more time and fails if the cluster is not safe to
# continue with.
- name: "Attempting to continue with yellow cluster health"
  when: "response.json.status == 'yellow'"
  block:
    - name: "Cluster health yellow: Wait before verifying status"
      ansible.builtin.pause:
        seconds: 10

    - name: "Cluster health yellow: Verify we can safely continue"
      ansible.builtin.uri:
        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
        method: GET
        user: elastic
        password: "{{ elasticstack_password.stdout }}"
        validate_certs: false
      register: response1
      # Fail when the cluster turned red during the pause, or when shards
      # are initializing or relocating again. Checking only the shard
      # counters would let a red cluster with idle shards pass.
      failed_when: >-
        response1.json.status == 'red' or
        response1.json.relocating_shards != 0 or
        response1.json.initializing_shards != 0
78+
5079
# Disabling shard allocation right after enabling it seems redundant. Please see above for details.
5180
- name: Disable shard allocation for the cluster
5281
ansible.builtin.uri:

0 commit comments

Comments
 (0)