From 5d482becc55f05e0315679741816654bc30bd9d4 Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Thu, 20 Nov 2025 11:19:11 -0500 Subject: [PATCH 1/4] [k8s] Fix pod name and worker index mismatch Fixes #8024 by updating `filter_pods` to sort pods by name (and numeric worker suffix). This ordering is then used downstream when creating command runners for each pod. This ensures that the worker numbers used by users match the actual pod numbering/names in Kubernetes. --- sky/provision/kubernetes/utils.py | 28 ++++- .../kubernetes/test_kubernetes_utils.py | 102 ++++++++++++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index bf239bf639f..d4d903ea062 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -3136,7 +3136,11 @@ def filter_pods(namespace: str, context: Optional[str], tag_filters: Dict[str, str], status_filters: Optional[List[str]] = None) -> Dict[str, Any]: - """Filters pods by tags and status.""" + """Filters pods by tags and status. + + Returned dict is sorted by name, with workers sorted by their numeric suffix. + This ensures consistent ordering for SSH configuration and other operations. + """ non_included_pod_statuses = POD_STATUSES.copy() field_selector = '' @@ -3154,7 +3158,27 @@ def filter_pods(namespace: str, pods = [ pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None ] - return {pod.metadata.name: pod for pod in pods} + + # Sort pods by name, with workers sorted by their numeric suffix. + # This ensures consistent ordering (e.g., cluster-head, cluster-worker1, + # worker2, worker3, ...) even when Kubernetes API returns them in + # arbitrary order. This works even if there were somehow pod names other + # than head/worker ones, but that may be overkill. + def get_pod_sort_key(pod): + name = pod.metadata.name + if '-worker' in name: + try: + return (1, int(name.split('-worker')[-1])) + except (ValueError, IndexError): + return (2, name) + elif '-head' in name: + return (0, name) + else: + return (2, name) + + sorted_pods = sorted(pods, key=get_pod_sort_key) + + return {pod.metadata.name: pod for pod in sorted_pods} def _remove_pod_annotation(pod: Any, diff --git a/tests/unit_tests/kubernetes/test_kubernetes_utils.py b/tests/unit_tests/kubernetes/test_kubernetes_utils.py index a39a943075e..48ccbf66310 100644 --- a/tests/unit_tests/kubernetes/test_kubernetes_utils.py +++ b/tests/unit_tests/kubernetes/test_kubernetes_utils.py @@ -4,6 +4,7 @@ import collections import os +import re import tempfile from typing import Optional import unittest @@ -1913,3 +1914,104 @@ def track_calls(cloud, assert len(custom_metadata_calls) >= 1 assert custom_metadata_calls[0]['cloud'] == 'ssh', \ "custom_metadata should use SSH cloud" + + +def test_filter_pods_sorts_by_name(): + """Test that filter_pods returns pods sorted correctly""" + # Head pod should come first, then worker pods sorted by their numeric + # suffix, then any other pods alphabetically. + + # Create mock pods with out-of-order names + mock_pod_worker4 = mock.MagicMock() + mock_pod_worker4.metadata.name = 'test-cluster-worker4' + mock_pod_worker4.metadata.deletion_timestamp = None + + mock_pod_worker1 = mock.MagicMock() + mock_pod_worker1.metadata.name = 'test-cluster-worker1' + mock_pod_worker1.metadata.deletion_timestamp = None + + mock_pod_head = mock.MagicMock() + mock_pod_head.metadata.name = 'test-cluster-head' + mock_pod_head.metadata.deletion_timestamp = None + + mock_pod_worker3 = mock.MagicMock() + mock_pod_worker3.metadata.name = 'test-cluster-worker3' + mock_pod_worker3.metadata.deletion_timestamp = None + + mock_pod_worker2 = mock.MagicMock() + mock_pod_worker2.metadata.name = 'test-cluster-worker2' + mock_pod_worker2.metadata.deletion_timestamp = None + + # Mock pod list returned in arbitrary order + mock_pod_list = mock.MagicMock() + mock_pod_list.items = [ + mock_pod_worker4, + mock_pod_worker1, + mock_pod_worker3, + mock_pod_head, + mock_pod_worker2, + ] + + with patch('sky.provision.kubernetes.utils.kubernetes.core_api' + ) as mock_core_api: + mock_core_api.return_value.list_namespaced_pod.return_value = mock_pod_list + + result = utils.filter_pods(namespace='test-namespace', + context='test-context', + tag_filters={'test-label': 'test-value'}) + + # Verify the pods are returned in sorted order + pod_names = list(result.keys()) + assert pod_names == [ + 'test-cluster-head', + 'test-cluster-worker1', + 'test-cluster-worker2', + 'test-cluster-worker3', + 'test-cluster-worker4', + ] + + +def test_filter_pods_handles_gaps_in_worker_numbers(): + """Test that filter_pods correctly sorts workers even with gaps in numbering""" + # Create mock pods with gaps (worker1, worker2, worker5) + mock_pod_worker5 = mock.MagicMock() + mock_pod_worker5.metadata.name = 'test-cluster-worker5' + mock_pod_worker5.metadata.deletion_timestamp = None + + mock_pod_worker2 = mock.MagicMock() + mock_pod_worker2.metadata.name = 'test-cluster-worker2' + mock_pod_worker2.metadata.deletion_timestamp = None + + mock_pod_head = mock.MagicMock() + mock_pod_head.metadata.name = 'test-cluster-head' + mock_pod_head.metadata.deletion_timestamp = None + + mock_pod_worker1 = mock.MagicMock() + mock_pod_worker1.metadata.name = 'test-cluster-worker1' + mock_pod_worker1.metadata.deletion_timestamp = None + + # Mock pod list in arbitrary order + mock_pod_list = mock.MagicMock() + mock_pod_list.items = [ + mock_pod_worker5, + mock_pod_worker2, + mock_pod_head, + mock_pod_worker1, + ] + + with patch('sky.provision.kubernetes.utils.kubernetes.core_api' + ) as mock_core_api: + mock_core_api.return_value.list_namespaced_pod.return_value = mock_pod_list + + result = utils.filter_pods(namespace='test-namespace', + context='test-context', + tag_filters={'test-label': 'test-value'}) + + # Verify the pods are returned in sorted order with gaps preserved + pod_names = list(result.keys()) + assert pod_names == [ + 'test-cluster-head', + 'test-cluster-worker1', + 'test-cluster-worker2', + 'test-cluster-worker5', + ] From e2379519abfb08417462d00bc1ad2dc601065713 Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Mon, 24 Nov 2025 19:20:58 -0500 Subject: [PATCH 2/4] add type signatures to get_pod_sort_key --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index d4d903ea062..83b793a73c5 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -3164,7 +3164,7 @@ def filter_pods(namespace: str, # worker2, worker3, ...) even when Kubernetes API returns them in # arbitrary order. This works even if there were somehow pod names other # than head/worker ones, but that may be overkill. - def get_pod_sort_key(pod): + def get_pod_sort_key(pod: V1Pod) -> Tuple[int, Union[int, str]]: name = pod.metadata.name if '-worker' in name: try: From dfa66bcb35d810dafbb5bad6863dd07662f6e60b Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Wed, 26 Nov 2025 00:10:17 -0500 Subject: [PATCH 3/4] Make get_pod_sort_key more accurate --- sky/provision/kubernetes/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 83b793a73c5..5925ad9985e 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -14,7 +14,8 @@ import subprocess import time import typing -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple, + Union) import ijson @@ -3164,7 +3165,10 @@ def filter_pods(namespace: str, # worker2, worker3, ...) even when Kubernetes API returns them in # arbitrary order. This works even if there were somehow pod names other # than head/worker ones, but that may be overkill. - def get_pod_sort_key(pod: V1Pod) -> Tuple[int, Union[int, str]]: + def get_pod_sort_key( + pod: V1Pod + ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int], Tuple[Literal[2], + str]]: name = pod.metadata.name if '-worker' in name: try: From 12dbd4bfa649ce5f3945b14c3e4196c7ddf2bda5 Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Wed, 26 Nov 2025 11:11:41 -0500 Subject: [PATCH 4/4] Handle "-worker" in cluster names in get_pod_sort_key --- sky/provision/kubernetes/utils.py | 16 +- .../kubernetes/test_kubernetes_utils.py | 139 +++++++----------- 2 files changed, 62 insertions(+), 93 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 5925ad9985e..cc321b4e589 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -3162,21 +3162,23 @@ def filter_pods(namespace: str, # Sort pods by name, with workers sorted by their numeric suffix. # This ensures consistent ordering (e.g., cluster-head, cluster-worker1, - # worker2, worker3, ...) even when Kubernetes API returns them in - # arbitrary order. This works even if there were somehow pod names other - # than head/worker ones, but that may be overkill. + # cluster-worker2, cluster-worker3, ...) even when Kubernetes API + # returns them in arbitrary order. This works even if there were + # somehow pod names other than head/worker ones, and those end up at + # the end of the list. def get_pod_sort_key( pod: V1Pod ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int], Tuple[Literal[2], str]]: name = pod.metadata.name - if '-worker' in name: + name_suffix = name.split('-')[-1] + if name_suffix == 'head': + return (0, name) + elif name_suffix.startswith('worker'): try: - return (1, int(name.split('-worker')[-1])) + return (1, int(name_suffix.split('worker')[-1])) except (ValueError, IndexError): return (2, name) - elif '-head' in name: - return (0, name) else: return (2, name) diff --git a/tests/unit_tests/kubernetes/test_kubernetes_utils.py b/tests/unit_tests/kubernetes/test_kubernetes_utils.py index 48ccbf66310..c392482c2be 100644 --- a/tests/unit_tests/kubernetes/test_kubernetes_utils.py +++ b/tests/unit_tests/kubernetes/test_kubernetes_utils.py @@ -1916,41 +1916,60 @@ def track_calls(cloud, "custom_metadata should use SSH cloud" -def test_filter_pods_sorts_by_name(): +@pytest.mark.parametrize('unsorted_pod_names, expected_sorted_pod_names', [ + ([ + 'test-cluster-worker10', 'test-cluster-worker2', 'test-cluster-head', + 'test-cluster-worker1', 'test-cluster-worker3' + ], [ + 'test-cluster-head', 'test-cluster-worker1', 'test-cluster-worker2', + 'test-cluster-worker3', 'test-cluster-worker10' + ]), + ([ + 'test-cluster-worker1', 'test-cluster-worker20', 'test-cluster-head', + 'test-cluster-worker3', 'test-cluster-worker2' + ], [ + 'test-cluster-head', 'test-cluster-worker1', 'test-cluster-worker2', + 'test-cluster-worker3', 'test-cluster-worker20' + ]), + ([ + 'test-cluster-worker1', 'test-cluster-worker2', 'test-cluster-head', + 'test-cluster-worker3', 'test-cluster-worker4' + ], [ + 'test-cluster-head', 'test-cluster-worker1', 'test-cluster-worker2', + 'test-cluster-worker3', 'test-cluster-worker4' + ]), + ([ + 'test-cluster-head', 'test-cluster-worker1', 'test-cluster-worker2', + 'test-cluster-worker3', 'test-cluster-worker4' + ], [ + 'test-cluster-head', 'test-cluster-worker1', 'test-cluster-worker2', + 'test-cluster-worker3', 'test-cluster-worker4' + ]), + ([ + 'my-worker-head', 'my-worker-worker1', 'my-worker-worker2', + 'my-worker-worker3', 'my-worker-worker4' + ], [ + 'my-worker-head', 'my-worker-worker1', 'my-worker-worker2', + 'my-worker-worker3', 'my-worker-worker4' + ]), + ([ + 'my-worker-head', 'my-worker-worker1', 'extra-pod', 'my-worker-worker2', + 'my-worker-worker3', 'my-worker-worker4' + ], [ + 'my-worker-head', 'my-worker-worker1', 'my-worker-worker2', + 'my-worker-worker3', 'my-worker-worker4', 'extra-pod' + ]), +]) +def test_filter_pods_sorts_by_name(unsorted_pod_names, + expected_sorted_pod_names): """Test that filter_pods returns pods sorted correctly""" - # Head pod should come first, then worker pods sorted by their numeric - # suffix, then any other pods alphabetically. - - # Create mock pods with out-of-order names - mock_pod_worker4 = mock.MagicMock() - mock_pod_worker4.metadata.name = 'test-cluster-worker4' - mock_pod_worker4.metadata.deletion_timestamp = None - - mock_pod_worker1 = mock.MagicMock() - mock_pod_worker1.metadata.name = 'test-cluster-worker1' - mock_pod_worker1.metadata.deletion_timestamp = None - - mock_pod_head = mock.MagicMock() - mock_pod_head.metadata.name = 'test-cluster-head' - mock_pod_head.metadata.deletion_timestamp = None - - mock_pod_worker3 = mock.MagicMock() - mock_pod_worker3.metadata.name = 'test-cluster-worker3' - mock_pod_worker3.metadata.deletion_timestamp = None - - mock_pod_worker2 = mock.MagicMock() - mock_pod_worker2.metadata.name = 'test-cluster-worker2' - mock_pod_worker2.metadata.deletion_timestamp = None - - # Mock pod list returned in arbitrary order mock_pod_list = mock.MagicMock() - mock_pod_list.items = [ - mock_pod_worker4, - mock_pod_worker1, - mock_pod_worker3, - mock_pod_head, - mock_pod_worker2, - ] + mock_pod_list.items = [] + for pod_name in unsorted_pod_names: + mock_pod = mock.MagicMock() + mock_pod.metadata.name = pod_name + mock_pod.metadata.deletion_timestamp = None + mock_pod_list.items.append(mock_pod) with patch('sky.provision.kubernetes.utils.kubernetes.core_api' ) as mock_core_api: @@ -1962,56 +1981,4 @@ def test_filter_pods_sorts_by_name(): # Verify the pods are returned in sorted order pod_names = list(result.keys()) - assert pod_names == [ - 'test-cluster-head', - 'test-cluster-worker1', - 'test-cluster-worker2', - 'test-cluster-worker3', - 'test-cluster-worker4', - ] - - -def test_filter_pods_handles_gaps_in_worker_numbers(): - """Test that filter_pods correctly sorts workers even with gaps in numbering""" - # Create mock pods with gaps (worker1, worker2, worker5) - mock_pod_worker5 = mock.MagicMock() - mock_pod_worker5.metadata.name = 'test-cluster-worker5' - mock_pod_worker5.metadata.deletion_timestamp = None - - mock_pod_worker2 = mock.MagicMock() - mock_pod_worker2.metadata.name = 'test-cluster-worker2' - mock_pod_worker2.metadata.deletion_timestamp = None - - mock_pod_head = mock.MagicMock() - mock_pod_head.metadata.name = 'test-cluster-head' - mock_pod_head.metadata.deletion_timestamp = None - - mock_pod_worker1 = mock.MagicMock() - mock_pod_worker1.metadata.name = 'test-cluster-worker1' - mock_pod_worker1.metadata.deletion_timestamp = None - - # Mock pod list in arbitrary order - mock_pod_list = mock.MagicMock() - mock_pod_list.items = [ - mock_pod_worker5, - mock_pod_worker2, - mock_pod_head, - mock_pod_worker1, - ] - - with patch('sky.provision.kubernetes.utils.kubernetes.core_api' - ) as mock_core_api: - mock_core_api.return_value.list_namespaced_pod.return_value = mock_pod_list - - result = utils.filter_pods(namespace='test-namespace', - context='test-context', - tag_filters={'test-label': 'test-value'}) - - # Verify the pods are returned in sorted order with gaps preserved - pod_names = list(result.keys()) - assert pod_names == [ - 'test-cluster-head', - 'test-cluster-worker1', - 'test-cluster-worker2', - 'test-cluster-worker5', - ] + assert pod_names == expected_sorted_pod_names