Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 0 additions & 43 deletions sky/client/cli/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -3040,34 +3040,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
# there are no in-progress managed jobs.
managed_jobs_ = []
pools_ = []
except exceptions.InconsistentConsolidationModeError:
# If this error is raised, it means the user switched to the
# consolidation mode but the previous controller cluster is still
# running. We should allow the user to tear down the controller
# cluster in this case.
with skypilot_config.override_skypilot_config(
{'jobs': {
'controller': {
'consolidation_mode': False
}
}}):
# Check again with the consolidation mode disabled. This is to
# make sure there are no in-progress managed jobs.
request_id, queue_result_version = (
cli_utils.get_managed_job_queue(
refresh=False,
skip_finished=True,
all_users=True,
fields=fields,
))
result = sdk.stream_and_get(request_id)
if queue_result_version.v2():
managed_jobs_, _, status_counts, _ = result
else:
managed_jobs_ = typing.cast(
List[responses.ManagedJobRecord], result)
request_id_pools = managed_jobs.pool_status(pool_names=None)
pools_ = sdk.stream_and_get(request_id_pools)

msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
'jobs controller. Please be aware of the following:'
Expand Down Expand Up @@ -3144,21 +3116,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
# controller being STOPPED or being launched for the first time, i.e.,
# there are no in-progress services.
services = []
except exceptions.InconsistentConsolidationModeError:
# If this error is raised, it means the user switched to the
# consolidation mode but the previous controller cluster is still
# running. We should allow the user to tear down the controller
# cluster in this case.
with skypilot_config.override_skypilot_config(
{'serve': {
'controller': {
'consolidation_mode': False
}
}}):
# Check again with the consolidation mode disabled. This is to
# make sure there are no in-progress services.
request_id = serve_lib.status(service_names=None)
services = sdk.stream_and_get(request_id)

if services:
service_names = [service['name'] for service in services]
Expand Down
6 changes: 0 additions & 6 deletions sky/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,6 @@ class InconsistentHighAvailabilityError(Exception):
pass


class InconsistentConsolidationModeError(Exception):
    """Signals a consolidation mode mismatch.

    Raised when the consolidation mode property set in the user config
    does not agree with the actual cluster.
    """


class ProvisionPrechecksError(Exception):
"""Raised when a managed job fails prechecks before provision.

Expand Down
33 changes: 13 additions & 20 deletions sky/jobs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,27 +186,23 @@ def _validate_consolidation_mode_config(
controller_cn = (
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
if global_user_state.cluster_with_name_exists(controller_cn):
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode for jobs is '
f'enabled, but the controller cluster '
f'{controller_cn} is still running. Please '
'terminate the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
f'but the controller cluster {controller_cn} is still running. '
'Please terminate the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
Comment on lines +189 to +193
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like we should print out how many running jobs are still on the jobs controller here, just to get a sense of how many resources are still on the old deployment. The same applies to serve.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will make this call much slower, so I think it could be a problem.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. Let's just keep this :)

else:
total_jobs = managed_job_state.get_managed_jobs_total()
if total_jobs > 0:
nonterminal_jobs = (
managed_job_state.get_nonterminal_job_ids_by_name(
None, None, all_users=True))
if nonterminal_jobs:
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode '
'is disabled, but there are still '
f'{len(nonterminal_jobs)} managed jobs '
'running. Please terminate those jobs '
f'first.{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
f'but there are still {len(nonterminal_jobs)} managed jobs '
'running. Please terminate those jobs first.'
f'{colorama.Style.RESET_ALL}')
else:
logger.warning(
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
Expand All @@ -233,14 +229,11 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
signal_file = pathlib.Path(
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()

restart_signal_file_exists = signal_file.exists()
consolidation_mode = (config_consolidation_mode and
restart_signal_file_exists)

if on_api_restart:
if config_consolidation_mode:
signal_file.touch()
else:
restart_signal_file_exists = signal_file.exists()
if not restart_signal_file_exists:
if config_consolidation_mode:
logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
Expand All @@ -259,8 +252,8 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
# have related config and will always appear disabled for consolidation
# mode. Check #6611 for more details.
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
_validate_consolidation_mode_config(consolidation_mode)
return consolidation_mode
_validate_consolidation_mode_config(config_consolidation_mode)
return config_consolidation_mode


def ha_recovery_for_consolidation_mode() -> None:
Expand Down
24 changes: 11 additions & 13 deletions sky/serve/serve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,25 +218,23 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
if current_is_consolidation_mode:
controller_cn = controller.cluster_name
if global_user_state.cluster_with_name_exists(controller_cn):
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is enabled, but the '
f'controller cluster {controller_cn} is still running. '
'Please terminate the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is enabled, but the controller '
f'cluster {controller_cn} is still running. Please terminate '
'the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
else:
noun = 'pool' if pool else 'service'
all_services = [
svc for svc in serve_state.get_services() if svc['pool'] == pool
]
if all_services:
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is disabled, but there are '
f'still {len(all_services)} {noun}s running. Please '
f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is disabled, but there are '
f'still {len(all_services)} {noun}s running. Please terminate '
f'those {noun}s first.{colorama.Style.RESET_ALL}')


@annotations.lru_cache(scope='request', maxsize=1)
Expand Down
Loading