Skip to content

Commit b529b27

Browse files
lloyd-browncg505
authored andcommitted
[Core] Ensure --retry-until-up Tries Launch After Checking All Zones (#8079)
* Retry if launch failed. * Fix at API server level. * Add smoke test. * Remove unnecessary statements. * Assert instead of log.
1 parent cc44415 commit b529b27

File tree

3 files changed

+155
-0
lines changed

3 files changed

+155
-0
lines changed

sky/server/requests/executor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,9 +270,16 @@ def handle_task_result(self, fut: concurrent.futures.Future,
270270
queue.put(request_element)
271271
except exceptions.ExecutionRetryableError as e:
272272
time.sleep(e.retry_wait_seconds)
273+
# Reset the request status to PENDING so it can be picked up again.
274+
# Assume retryable since the error is ExecutionRetryableError.
275+
request_id, _, _ = request_element
276+
with api_requests.update_request(request_id) as request_task:
277+
assert request_task is not None, request_id
278+
request_task.status = api_requests.RequestStatus.PENDING
273279
# Reschedule the request.
274280
queue = _get_queue(self.schedule_type)
275281
queue.put(request_element)
282+
logger.info(f'Rescheduled request {request_id} for retry')
276283
finally:
277284
# Increment the free executor count when a request finishes
278285
if metrics_utils.METRICS_ENABLED:

tests/smoke_tests/test_basic.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,3 +1775,27 @@ def test_cluster_setup_num_gpus():
17751775
teardown=f'sky down -y {name}',
17761776
)
17771777
smoke_tests_utils.run_one_test(test)
1778+
1779+
1780+
@pytest.mark.aws
1781+
def test_launch_retry_until_up():
1782+
"""Test that retry until up considers more resources after trying all zones."""
1783+
cluster_name = smoke_tests_utils.get_cluster_name()
1784+
timeout = 180
1785+
test = smoke_tests_utils.Test(
1786+
'launch-retry-until-up',
1787+
[
1788+
# Launch something we'll never get.
1789+
f's=$(timeout {timeout} sky launch -c {cluster_name} --gpus B200:8 --infra aws echo hi -y -d --retry-until-up --use-spot 2>&1 || true) && '
1790+
# Check that "Retry after" appears in the output
1791+
'echo "$s" | grep -q "Retry after" && '
1792+
# Find the first occurrence of "Retry after" and get its line number
1793+
'RETRY_LINE=$(echo "$s" | grep -n "Retry after" | head -1 | cut -d: -f1) && '
1794+
# Check that "Considered resources" appears after the first "Retry after"
1795+
# We do this by extracting all lines after RETRY_LINE and checking if "Considered resources" appears
1796+
'echo "$s" | tail -n +$((RETRY_LINE + 1)) | grep -q "Considered resources"'
1797+
],
1798+
timeout=200, # Slightly more than 180 to account for test overhead
1799+
teardown=f'sky down -y {cluster_name}',
1800+
)
1801+
smoke_tests_utils.run_one_test(test)

tests/unit_tests/test_sky/server/requests/test_executor.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""Unit tests for sky.server.requests.executor module."""
22
import asyncio
3+
import concurrent.futures
34
import functools
45
import os
6+
import queue as queue_lib
57
import time
68
from typing import List
79
from unittest import mock
@@ -10,6 +12,7 @@
1012

1113
from sky import exceptions
1214
from sky import skypilot_config
15+
from sky.server import config as server_config
1316
from sky.server import constants as server_constants
1417
from sky.server.requests import executor
1518
from sky.server.requests import payloads
@@ -452,6 +455,11 @@ def _keyboard_interrupt_entrypoint():
452455
raise KeyboardInterrupt()
453456

454457

458+
def _dummy_entrypoint_for_retry_test():
459+
"""Dummy entrypoint for retry test that can be pickled."""
460+
return None
461+
462+
455463
@pytest.mark.asyncio
456464
@pytest.mark.parametrize('test_case', [
457465
pytest.param(
@@ -518,3 +526,119 @@ async def test_stdout_stderr_restoration(mock_fd_operations, test_case):
518526
# Verify no double-close
519527
_assert_no_double_close(mock_fd_operations['close_calls'],
520528
mock_fd_operations['created_fds'])
529+
530+
531+
@pytest.mark.asyncio
532+
async def test_request_worker_retry_execution_retryable_error(
533+
isolated_database, monkeypatch):
534+
"""Test that RequestWorker retries requests when ExecutionRetryableError is raised."""
535+
# Create a request in the database
536+
request_id = 'test-retry-request'
537+
request = requests_lib.Request(
538+
request_id=request_id,
539+
name='test-request',
540+
entrypoint=
541+
_dummy_entrypoint_for_retry_test, # Won't be called in this test
542+
request_body=payloads.RequestBody(),
543+
status=requests_lib.RequestStatus.RUNNING,
544+
created_at=time.time(),
545+
user_id='test-user',
546+
)
547+
await requests_lib.create_if_not_exists_async(request)
548+
549+
# Create a mock queue that tracks puts
550+
queue_items = []
551+
mock_queue = queue_lib.Queue()
552+
553+
class MockRequestQueue:
554+
555+
def __init__(self, queue):
556+
self.queue = queue
557+
558+
def get(self):
559+
try:
560+
return self.queue.get(block=False)
561+
except queue_lib.Empty:
562+
return None
563+
564+
def put(self, item):
565+
queue_items.append(item)
566+
self.queue.put(item)
567+
568+
request_queue = MockRequestQueue(mock_queue)
569+
570+
# Mock _get_queue to return our mock queue
571+
def mock_get_queue(schedule_type):
572+
return request_queue
573+
574+
monkeypatch.setattr(executor, '_get_queue', mock_get_queue)
575+
576+
# Mock time.sleep to track calls (but still sleep for very short waits)
577+
sleep_calls = []
578+
579+
def mock_sleep(seconds):
580+
sleep_calls.append(seconds)
581+
582+
monkeypatch.setattr('time.sleep', mock_sleep)
583+
584+
# Create a mock executor that tracks submit_until_success calls
585+
submit_calls = []
586+
587+
class MockExecutor:
588+
589+
def submit_until_success(self, fn, *args, **kwargs):
590+
submit_calls.append((fn, args, kwargs))
591+
# Return a future that immediately completes (does nothing)
592+
fut = concurrent.futures.Future()
593+
fut.set_result(None)
594+
return fut
595+
596+
mock_executor = MockExecutor()
597+
598+
# Create a RequestWorker
599+
worker = executor.RequestWorker(
600+
schedule_type=requests_lib.ScheduleType.LONG,
601+
config=server_config.WorkerConfig(garanteed_parallelism=1,
602+
burstable_parallelism=0,
603+
num_db_connections_per_worker=0))
604+
605+
# Create a future that raises ExecutionRetryableError
606+
retryable_error = exceptions.ExecutionRetryableError(
607+
'Failed to provision all possible launchable resources.',
608+
hint='Retry after 30s',
609+
retry_wait_seconds=30)
610+
fut = concurrent.futures.Future()
611+
fut.set_exception(retryable_error)
612+
613+
# Create request_element tuple
614+
request_element = (request_id, False, True
615+
) # (request_id, ignore_return_value, retryable)
616+
617+
# Call handle_task_result - this should catch the exception and reschedule
618+
worker.handle_task_result(fut, request_element)
619+
620+
# Verify the request was put back on the queue
621+
assert queue_items == [
622+
request_element
623+
], (f'Expected {request_element} to be put on queue, got {queue_items[0]}')
624+
625+
# Verify time.sleep was called with the retry wait time (first call should be 30)
626+
assert sleep_calls == [
627+
30
628+
], (f'Expected first time.sleep call to be 30 seconds, got {sleep_calls[0]}'
629+
)
630+
631+
# Verify the request status was reset to PENDING
632+
updated_request = requests_lib.get_request(request_id, fields=['status'])
633+
assert updated_request is not None
634+
assert updated_request.status == requests_lib.RequestStatus.PENDING, (
635+
f'Expected request status to be PENDING, got {updated_request.status}')
636+
637+
# Call process_request - it should pick up the request from the queue
638+
# and call submit_until_success
639+
worker.process_request(mock_executor, request_queue)
640+
641+
# Verify submit_until_success was called
642+
assert len(submit_calls) == 1, (
643+
f'Expected submit_until_success to be called once, got {len(submit_calls)} calls'
644+
)

0 commit comments

Comments
 (0)