|
15 | 15 | ansible.builtin.set_fact: |
16 | 16 | elasticsearch_http_protocol: "https" |
17 | 17 |
|
18 | | -# Usually we should not need this step. It is only here to recover from broken upgrade plays. |
19 | | -# Without this step, the cluster would never recover and the play would always fail. |
20 | | -- name: Enable shard allocation for the cluster |
21 | | - ansible.builtin.uri: |
22 | | - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings" |
23 | | - method: PUT |
24 | | - body: '{ "persistent": { "cluster.routing.allocation.enable": null }}' |
25 | | - body_format: json |
26 | | - user: elastic |
27 | | - password: "{{ elasticstack_password.stdout }}" |
28 | | - validate_certs: no |
29 | | - register: response |
30 | | - # next line is boolean not string, so no quotes around true |
31 | | - # use python truthiness |
32 | | - until: "response.json.acknowledged == true" |
33 | | - retries: 5 |
34 | | - delay: 30 |
35 | | - |
36 | | - |
37 | | -# |
38 | | -# Start cluster health check |
39 | | -# |
40 | | - |
41 | | -# this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards |
42 | | -# |
43 | | -# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html |
44 | | -## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version. |
45 | | -## |
46 | | -## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow. |
47 | | -## |
48 | | -## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns). |
49 | | - |
50 | | -- name: Check cluster health |
51 | | - block: |
52 | | - - name: Wait for cluster health to return to green |
53 | | - ansible.builtin.uri: |
54 | | - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" |
55 | | - method: GET |
56 | | - user: elastic |
57 | | - password: "{{ elasticstack_password.stdout }}" |
58 | | - validate_certs: no |
59 | | - register: response |
60 | | - until: "response.json.status == 'green'" |
61 | | - retries: 50 |
62 | | - delay: 30 |
63 | | - |
64 | | - # Timed out while waiting for green cluster |
65 | | - # Check if we can continue with a yellow cluster |
66 | | - rescue: |
67 | | - - name: "Rescue: Check if cluster health is yellow" |
68 | | - ansible.builtin.uri: |
69 | | - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" |
70 | | - method: GET |
71 | | - user: elastic |
72 | | - password: "{{ elasticstack_password.stdout }}" |
73 | | - validate_certs: no |
74 | | - register: response |
75 | | - failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" |
76 | | - |
77 | | - - name: "Rescure: Wait before verifying status" |
78 | | - ansible.builtin.pause: |
79 | | - seconds: 10 |
80 | | - |
81 | | - - name: "Rescue: Verify we can safely continue with yellow cluster" |
82 | | - ansible.builtin.uri: |
83 | | - url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health" |
84 | | - method: GET |
85 | | - user: elastic |
86 | | - password: "{{ elasticstack_password.stdout }}" |
87 | | - validate_certs: no |
88 | | - register: response |
89 | | - failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0" |
90 | | - |
91 | | -# |
92 | | -# End cluster health check |
93 | | -# |
| 18 | +# This step is here primarily to recover from a broken or restarted upgrade or rolling restart. |
| 19 | +# TODO: Only run this task for the first host. |
| 20 | +- name: Cluster health check |
| 21 | + ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml |
94 | 22 |
|
95 | 23 |
|
96 | 24 | # Disabling shard allocation right after enabling it seems redundant. Please see above for details. |
|
0 commit comments