Commit 964e02d
Log pending cluster state tasks after cluster cleanup timeout (elastic#119186) (elastic#119220)
The Cluster Health API call in the `ESRestTestCase` cleanup waits for all cluster state tasks to finish. We frequently see that API call time out with plenty of tasks still in the queue. With this commit, we retrieve the pending tasks and log them, but only if the Cluster Health API call timed out. This will hopefully give some more insight into what (kind of) tasks are still pending when the API call times out.

Co-authored-by: Elastic Machine <[email protected]>
1 parent b4f900a commit 964e02d
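For context: `wait_for_events=languid` makes the Cluster Health API block until every queued cluster state update task, down to the lowest (LANGUID) priority, has been processed; if that does not happen within the request timeout, the API answers with HTTP 408. A rough sketch of the new error log, assuming the documented `/_cluster/pending_tasks` response shape (task values below are illustrative, not from a real run):

Timed out waiting for cluster updates to be processed, there are still running tasks:
{insert_order=101, priority=URGENT, source=create-index [foo_9], cause [api], time_in_queue_millis=86, time_in_queue=86ms}
{insert_order=46, priority=HIGH, source=shard-started shard id [[foo_2][1]], time_in_queue_millis=842, time_in_queue=842ms}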


test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java

Lines changed: 36 additions & 14 deletions
@@ -872,8 +872,7 @@ protected boolean preserveSearchableSnapshotsIndicesUponCompletion() {
     }

     private void wipeCluster() throws Exception {
-        logger.info("Waiting for all cluster updates up to this moment to be processed");
-        assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
+        waitForClusterUpdates();

         // Cleanup rollup before deleting indices. A rollup job might have bulks in-flight,
         // so we need to fully shut them down first otherwise a job might stall waiting
@@ -1039,6 +1038,38 @@ private void wipeCluster() {
         deleteAllNodeShutdownMetadata();
     }

+    private void waitForClusterUpdates() throws Exception {
+        logger.info("Waiting for all cluster updates up to this moment to be processed");
+        try {
+            assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
+        } catch (ResponseException e) {
+            if (e.getResponse().getStatusLine().getStatusCode() == HttpStatus.SC_REQUEST_TIMEOUT) {
+                final var pendingTasks = getPendingClusterStateTasks();
+                if (pendingTasks != null) {
+                    logger.error("Timed out waiting for cluster updates to be processed, {}", pendingTasks);
+                }
+            }
+            throw e;
+        }
+    }
+
+    private static String getPendingClusterStateTasks() {
+        try {
+            Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
+            List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
+            if (false == tasks.isEmpty()) {
+                StringBuilder message = new StringBuilder("there are still running tasks:");
+                for (Object task : tasks) {
+                    message.append('\n').append(task.toString());
+                }
+                return message.toString();
+            }
+        } catch (IOException e) {
+            fail(e, "Failed to retrieve pending tasks in the cluster during cleanup");
+        }
+        return null;
+    }
+
     /**
      * This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting
      * them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM
@@ -1488,18 +1519,9 @@ private void logIfThereAreRunningTasks() throws IOException {
      */
     private static void waitForClusterStateUpdatesToFinish() throws Exception {
         assertBusy(() -> {
-            try {
-                Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
-                List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
-                if (false == tasks.isEmpty()) {
-                    StringBuilder message = new StringBuilder("there are still running tasks:");
-                    for (Object task : tasks) {
-                        message.append('\n').append(task.toString());
-                    }
-                    fail(message.toString());
-                }
-            } catch (IOException e) {
-                fail("cannot get cluster's pending tasks: " + e.getMessage());
+            final var pendingTasks = getPendingClusterStateTasks();
+            if (pendingTasks != null) {
+                fail(pendingTasks);
             }
         }, 30, TimeUnit.SECONDS);
     }
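For readers outside the test framework, the same wait-then-diagnose pattern can be sketched standalone against the Elasticsearch low-level REST client. This is a minimal illustration; the class name, host, and port are assumptions, not part of this commit:

import java.io.IOException;
import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.ResponseException;
import org.elasticsearch.client.RestClient;

// Hypothetical demo class, not part of ESRestTestCase.
public class WaitForClusterUpdatesDemo {
    public static void main(String[] args) throws IOException {
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            try {
                // Blocks until all queued cluster state tasks, down to LANGUID priority, are processed.
                client.performRequest(new Request("GET", "/_cluster/health?wait_for_events=languid"));
            } catch (ResponseException e) {
                // 408 (request timeout) means the health call gave up while tasks were still queued.
                if (e.getResponse().getStatusLine().getStatusCode() == 408) {
                    Response pending = client.performRequest(new Request("GET", "/_cluster/pending_tasks"));
                    System.err.println("Pending cluster state tasks: " + EntityUtils.toString(pending.getEntity()));
                }
                throw e;
            }
        }
    }
}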
