Skip to content

Commit 841711b

Browse files
committed
feat: log debug pod information before deleting pods by agent
1 parent f799bb2 commit 841711b

File tree

1 file changed

+63
-10
lines changed

1 file changed

+63
-10
lines changed

clearml_agent/glue/k8s.py

Lines changed: 63 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1037,20 +1037,73 @@ def _process_bash_lines_response(self, bash_cmd: str, raise_error=True):
10371037
]
10381038
return lines
10391039

1040-
def _delete_pods(self, selectors: List[str], namespace: str, msg: str = None) -> List[str]:
1041-
kubectl_cmd = \
1042-
"kubectl delete pod -l={agent_label} " \
1043-
"--namespace={namespace} --field-selector={selector} --output name".format(
1040+
def _delete_pods(
1041+
self, selectors: List[str], namespace: str, msg: str = None
1042+
) -> List[str]:
1043+
kubectl_get_cmd = (
1044+
"kubectl get pod -l={agent_label} "
1045+
"--namespace={namespace} --field-selector={selector} --output json".format(
10441046
selector=",".join(selectors),
10451047
agent_label=self._get_agent_label(),
10461048
namespace=namespace,
10471049
)
1048-
self.log.debug("Deleting old/failed pods{} for ns {}: {}".format(
1049-
msg or "", namespace, kubectl_cmd
1050-
))
1051-
lines = self._process_bash_lines_response(kubectl_cmd)
1052-
self.log.debug(" - deleted pods %s", ", ".join(lines))
1053-
return lines
1050+
)
1051+
pods_to_delete = json.loads(get_bash_output(kubectl_get_cmd))
1052+
1053+
pod_names = []
1054+
for pod in pods_to_delete.get("items", []):
1055+
pod_name = get_path(pod, "metadata", "name", default="unknown")
1056+
exit_code = get_path(
1057+
pod,
1058+
"status",
1059+
"containerStatuses",
1060+
"0",
1061+
"state",
1062+
"terminated",
1063+
"exitCode",
1064+
default="unknown",
1065+
)
1066+
reason = get_path(
1067+
pod,
1068+
"status",
1069+
"containerStatuses",
1070+
"0",
1071+
"state",
1072+
"terminated",
1073+
"reason",
1074+
default="unknown",
1075+
)
1076+
message = get_path(
1077+
pod,
1078+
"status",
1079+
"containerStatuses",
1080+
"0",
1081+
"state",
1082+
"terminated",
1083+
"message",
1084+
default="unknown",
1085+
)
1086+
self.log.debug(
1087+
"Pod {pod_name} exited with code {exit_code}, reason: {reason}, message: {message} - {msg}".format(
1088+
pod_name=pod_name,
1089+
exit_code=exit_code,
1090+
reason=reason,
1091+
message=message,
1092+
msg=msg or "",
1093+
)
1094+
)
1095+
1096+
_ = get_bash_output(
1097+
"kubectl delete pod {pod_name} --namespace={namespace}".format(
1098+
pod_name=pod_name,
1099+
namespace=namespace,
1100+
),
1101+
raise_error=False,
1102+
)
1103+
self.log.debug("Deleted pod {} - {}".format(pod_name, msg or ""))
1104+
pod_names.append(pod_name)
1105+
1106+
return pod_names
10541107

10551108
def _delete_jobs_by_names(self, names_to_ns: Dict[str, str], msg: str = None) -> List[str]:
10561109
if not names_to_ns:

0 commit comments

Comments
 (0)