From a72774c71f515a416b2a711d7576f85fefa72e47 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Mon, 24 Nov 2025 15:22:21 -0800 Subject: [PATCH 1/4] ignore restart file on the first run --- sky/jobs/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index b6ecb17af10..1e4d4eae23d 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -233,14 +233,11 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool: signal_file = pathlib.Path( _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser() - restart_signal_file_exists = signal_file.exists() - consolidation_mode = (config_consolidation_mode and - restart_signal_file_exists) - if on_api_restart: if config_consolidation_mode: signal_file.touch() else: + restart_signal_file_exists = signal_file.exists() if not restart_signal_file_exists: if config_consolidation_mode: logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for ' @@ -259,8 +256,8 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool: # have related config and will always seemingly disabled for consolidation # mode. Check #6611 for more details. if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None: - _validate_consolidation_mode_config(consolidation_mode) - return consolidation_mode + _validate_consolidation_mode_config(config_consolidation_mode) + return config_consolidation_mode def ha_recovery_for_consolidation_mode() -> None: From dfa985e61dc079ac31a03b11be1132d6b48e58e9 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Mon, 24 Nov 2025 15:34:28 -0800 Subject: [PATCH 2/4] avoid crashing the server on inconsistent consolidation mode config --- sky/backends/backend_utils.py | 2 +- sky/jobs/server/core.py | 2 +- sky/jobs/utils.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 67c2fe660b0..8d7268d74a9 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2987,7 +2987,7 @@ def is_controller_accessible( exceptions.ClusterNotUpError: if the controller is not accessible, or failed to be connected. """ - if (managed_job_utils.is_consolidation_mode() and + if (managed_job_utils.is_consolidation_mode(validate=True) and controller == controller_utils.Controllers.JOBS_CONTROLLER ) or (serve_utils.is_consolidation_mode() and controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER): diff --git a/sky/jobs/server/core.py b/sky/jobs/server/core.py index 58b92fe65d5..f76a4bb9456 100644 --- a/sky/jobs/server/core.py +++ b/sky/jobs/server/core.py @@ -257,7 +257,7 @@ def launch( # to the jobs controller. for example if the postgres is whitelisted to # only the API server, this will then break. the simple solution to that is # telling the user to add the jobs controller to the postgres whitelist. - if not managed_job_utils.is_consolidation_mode(): + if not managed_job_utils.is_consolidation_mode(validate=True): db_path = mutated_user_config.get('db', None) if db_path is not None: parsed = urlparse.urlparse(db_path) diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index 1e4d4eae23d..20a67eab139 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -223,7 +223,7 @@ def _validate_consolidation_mode_config( # directly in the API Server. # Use LRU Cache so that the check is only done once. @annotations.lru_cache(scope='request', maxsize=2) -def is_consolidation_mode(on_api_restart: bool = False) -> bool: +def is_consolidation_mode(on_api_restart: bool = False, validate=False) -> bool: if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None: return True @@ -255,7 +255,8 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool: # We should only do this check on API server, as the controller will not # have related config and will always seemingly disabled for consolidation # mode. Check #6611 for more details. - if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None: + if validate and os.environ.get( + constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None: _validate_consolidation_mode_config(config_consolidation_mode) return config_consolidation_mode From a1c0fca0f57c5ab9ce149b025b9f90f4ba29c696 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Mon, 24 Nov 2025 15:53:59 -0800 Subject: [PATCH 3/4] Revert "avoid crashing the server on inconsistent consolidation mode config" This reverts commit dfa985e61dc079ac31a03b11be1132d6b48e58e9. --- sky/backends/backend_utils.py | 2 +- sky/jobs/server/core.py | 2 +- sky/jobs/utils.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 8d7268d74a9..67c2fe660b0 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -2987,7 +2987,7 @@ def is_controller_accessible( exceptions.ClusterNotUpError: if the controller is not accessible, or failed to be connected. """ - if (managed_job_utils.is_consolidation_mode(validate=True) and + if (managed_job_utils.is_consolidation_mode() and controller == controller_utils.Controllers.JOBS_CONTROLLER ) or (serve_utils.is_consolidation_mode() and controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER): diff --git a/sky/jobs/server/core.py b/sky/jobs/server/core.py index f76a4bb9456..58b92fe65d5 100644 --- a/sky/jobs/server/core.py +++ b/sky/jobs/server/core.py @@ -257,7 +257,7 @@ def launch( # to the jobs controller. for example if the postgres is whitelisted to # only the API server, this will then break. the simple solution to that is # telling the user to add the jobs controller to the postgres whitelist. - if not managed_job_utils.is_consolidation_mode(validate=True): + if not managed_job_utils.is_consolidation_mode(): db_path = mutated_user_config.get('db', None) if db_path is not None: parsed = urlparse.urlparse(db_path) diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index 20a67eab139..1e4d4eae23d 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -223,7 +223,7 @@ def _validate_consolidation_mode_config( # directly in the API Server. # Use LRU Cache so that the check is only done once. @annotations.lru_cache(scope='request', maxsize=2) -def is_consolidation_mode(on_api_restart: bool = False, validate=False) -> bool: +def is_consolidation_mode(on_api_restart: bool = False) -> bool: if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None: return True @@ -255,8 +255,7 @@ def is_consolidation_mode(on_api_restart: bool = False, validate=False) -> bool: # We should only do this check on API server, as the controller will not # have related config and will always seemingly disabled for consolidation # mode. Check #6611 for more details. - if validate and os.environ.get( - constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None: + if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None: _validate_consolidation_mode_config(config_consolidation_mode) return config_consolidation_mode From 6fd3a71f17b65f1b18ee6ddb0afa48380e2a7b20 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Mon, 24 Nov 2025 15:57:06 -0800 Subject: [PATCH 4/4] only use a warning for inconsistent consolidation mode --- sky/client/cli/command.py | 43 --------------------------------------- sky/exceptions.py | 6 ------ sky/jobs/utils.py | 24 +++++++++------------- sky/serve/serve_utils.py | 24 ++++++++++------------ 4 files changed, 21 insertions(+), 76 deletions(-) diff --git a/sky/client/cli/command.py b/sky/client/cli/command.py index 2a641398b91..055c338e340 100644 --- a/sky/client/cli/command.py +++ b/sky/client/cli/command.py @@ -3040,34 +3040,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str, # there is no in-prgress managed jobs. managed_jobs_ = [] pools_ = [] - except exceptions.InconsistentConsolidationModeError: - # If this error is raised, it means the user switched to the - # consolidation mode but the previous controller cluster is still - # running. We should allow the user to tear down the controller - # cluster in this case. - with skypilot_config.override_skypilot_config( - {'jobs': { - 'controller': { - 'consolidation_mode': False - } - }}): - # Check again with the consolidation mode disabled. This is to - # make sure there is no in-progress managed jobs. - request_id, queue_result_version = ( - cli_utils.get_managed_job_queue( - refresh=False, - skip_finished=True, - all_users=True, - fields=fields, - )) - result = sdk.stream_and_get(request_id) - if queue_result_version.v2(): - managed_jobs_, _, status_counts, _ = result - else: - managed_jobs_ = typing.cast( - List[responses.ManagedJobRecord], result) - request_id_pools = managed_jobs.pool_status(pool_names=None) - pools_ = sdk.stream_and_get(request_id_pools) msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' 'jobs controller. Please be aware of the following:' @@ -3144,21 +3116,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str, # controller being STOPPED or being firstly launched, i.e., there is # no in-prgress services. services = [] - except exceptions.InconsistentConsolidationModeError: - # If this error is raised, it means the user switched to the - # consolidation mode but the previous controller cluster is still - # running. We should allow the user to tear down the controller - # cluster in this case. - with skypilot_config.override_skypilot_config( - {'serve': { - 'controller': { - 'consolidation_mode': False - } - }}): - # Check again with the consolidation mode disabled. This is to - # make sure there is no in-progress services. - request_id = serve_lib.status(service_names=None) - services = sdk.stream_and_get(request_id) if services: service_names = [service['name'] for service in services] diff --git a/sky/exceptions.py b/sky/exceptions.py index 377f90a5df8..a2714f9b5ed 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -208,12 +208,6 @@ class InconsistentHighAvailabilityError(Exception): pass -class InconsistentConsolidationModeError(Exception): - """Raised when the consolidation mode property in the user config - is inconsistent with the actual cluster.""" - pass - - class ProvisionPrechecksError(Exception): """Raised when a managed job fails prechecks before provision. diff --git a/sky/jobs/utils.py b/sky/jobs/utils.py index 1e4d4eae23d..d71a6956cf3 100644 --- a/sky/jobs/utils.py +++ b/sky/jobs/utils.py @@ -186,13 +186,11 @@ def _validate_consolidation_mode_config( controller_cn = ( controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name) if global_user_state.cluster_with_name_exists(controller_cn): - with ux_utils.print_exception_no_traceback(): - raise exceptions.InconsistentConsolidationModeError( - f'{colorama.Fore.RED}Consolidation mode for jobs is ' - f'enabled, but the controller cluster ' - f'{controller_cn} is still running. Please ' - 'terminate the controller cluster first.' - f'{colorama.Style.RESET_ALL}') + logger.warning( + f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, ' + f'but the controller cluster {controller_cn} is still running. ' + 'Please terminate the controller cluster first.' + f'{colorama.Style.RESET_ALL}') else: total_jobs = managed_job_state.get_managed_jobs_total() if total_jobs > 0: @@ -200,13 +198,11 @@ def _validate_consolidation_mode_config( managed_job_state.get_nonterminal_job_ids_by_name( None, None, all_users=True)) if nonterminal_jobs: - with ux_utils.print_exception_no_traceback(): - raise exceptions.InconsistentConsolidationModeError( - f'{colorama.Fore.RED}Consolidation mode ' - 'is disabled, but there are still ' - f'{len(nonterminal_jobs)} managed jobs ' - 'running. Please terminate those jobs ' - f'first.{colorama.Style.RESET_ALL}') + logger.warning( + f'{colorama.Fore.YELLOW}Consolidation mode is disabled, ' + f'but there are still {len(nonterminal_jobs)} managed jobs ' + 'running. Please terminate those jobs first.' + f'{colorama.Style.RESET_ALL}') else: logger.warning( f'{colorama.Fore.YELLOW}Consolidation mode is disabled, ' diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 8dbbd0ad1af..80cf8b7362c 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -218,25 +218,23 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool, if current_is_consolidation_mode: controller_cn = controller.cluster_name if global_user_state.cluster_with_name_exists(controller_cn): - with ux_utils.print_exception_no_traceback(): - raise exceptions.InconsistentConsolidationModeError( - f'{colorama.Fore.RED}Consolidation mode for ' - f'{controller.controller_type} is enabled, but the ' - f'controller cluster {controller_cn} is still running. ' - 'Please terminate the controller cluster first.' - f'{colorama.Style.RESET_ALL}') + logger.warning( + f'{colorama.Fore.RED}Consolidation mode for ' + f'{controller.controller_type} is enabled, but the controller ' + f'cluster {controller_cn} is still running. Please terminate ' + 'the controller cluster first.' + f'{colorama.Style.RESET_ALL}') else: noun = 'pool' if pool else 'service' all_services = [ svc for svc in serve_state.get_services() if svc['pool'] == pool ] if all_services: - with ux_utils.print_exception_no_traceback(): - raise exceptions.InconsistentConsolidationModeError( - f'{colorama.Fore.RED}Consolidation mode for ' - f'{controller.controller_type} is disabled, but there are ' - f'still {len(all_services)} {noun}s running. Please ' - f'terminate those {noun}s first.{colorama.Style.RESET_ALL}') + logger.warning( + f'{colorama.Fore.RED}Consolidation mode for ' + f'{controller.controller_type} is disabled, but there are ' + f'still {len(all_services)} {noun}s running. Please terminate ' + f'those {noun}s first.{colorama.Style.RESET_ALL}') @annotations.lru_cache(scope='request', maxsize=1)