Skip to content

Commit

Permalink
[UX] Better logging message for operators on the clusters terminated manually in the cloud (#2389)
Browse files (browse the repository at this point in the history)

* Show the cluster manually terminated for operators

* Fix message

* Better logging

* format

* Update sky/backends/backend_utils.py

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Fix multiple cluster case

---------

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
  • Loading branch information
Michaelvll and concretevitamin committed Aug 30, 2023
1 parent c083f20 commit 59dc4b4
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 6 deletions.
31 changes: 26 additions & 5 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2366,10 +2366,15 @@ def check_cluster_available(
exceptions.CloudUserIdentityError: if we fail to get the current user
identity.
"""
record = global_user_state.get_cluster_from_name(cluster_name)
if dryrun:
record = global_user_state.get_cluster_from_name(cluster_name)
assert record is not None, cluster_name
return record['handle']

previous_cluster_status = None
if record is not None:
previous_cluster_status = record['status']

try:
cluster_status, handle = refresh_cluster_status_handle(cluster_name)
except exceptions.ClusterStatusFetchingError as e:
Expand All @@ -2388,7 +2393,6 @@ def check_cluster_available(
f'Failed to refresh the status for cluster {cluster_name!r}. It is '
f'not fatal, but {operation} might hang if the cluster is not up.\n'
f'Detailed reason: {e}')
record = global_user_state.get_cluster_from_name(cluster_name)
if record is None:
cluster_status, handle = None, None
else:
Expand All @@ -2397,10 +2401,27 @@ def check_cluster_available(
bright = colorama.Style.BRIGHT
reset = colorama.Style.RESET_ALL
if handle is None:
error_msg = (f'Cluster {cluster_name!r} not found on the cloud '
'provider.')
if previous_cluster_status is not None:
assert record is not None, previous_cluster_status
actions = []
if record['handle'].launched_resources.use_spot:
actions.append('preempted')
if record['autostop'] > 0 and record['to_down']:
actions.append('autodowned')
actions.append('manually terminated in console')
if len(actions) > 1:
actions[-1] = 'or ' + actions[-1]
actions_str = ', '.join(actions)
message = f' It was likely {actions_str}.'
if len(actions) > 1:
message = message.replace('likely', 'either')
error_msg += message

with ux_utils.print_exception_no_traceback():
raise ValueError(
f'{colorama.Fore.YELLOW}Cluster {cluster_name!r} does not '
f'exist.{reset}')
raise ValueError(f'{colorama.Fore.YELLOW}{error_msg}{reset}')
assert cluster_status is not None, 'handle is not None but status is None'
backend = get_backend_from_handle(handle)
if check_cloud_vm_ray_backend and not isinstance(
backend, backends.CloudVmRayBackend):
Expand Down
2 changes: 1 addition & 1 deletion sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1893,7 +1893,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
for cluster in clusters:
try:
job_table = core.queue(cluster, skip_finished, all_users)
except (RuntimeError, exceptions.NotSupportedError,
except (RuntimeError, ValueError, exceptions.NotSupportedError,
exceptions.ClusterNotUpError, exceptions.CloudUserIdentityError,
exceptions.ClusterOwnerIdentityMismatchError) as e:
if isinstance(e, exceptions.NotSupportedError):
Expand Down

0 comments on commit 59dc4b4

Please sign in to comment.