Skip to content

Commit

Permalink
fix tpu bug (#2350)
Browse files Browse the repository at this point in the history
* fix tpu bug

* format

* update
  • Loading branch information
infwinston committed Aug 4, 2023
1 parent ca2a092 commit e9be60a
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 4 deletions.
12 changes: 11 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2248,7 +2248,7 @@ def _update_stable_ssh_ports(self, max_attempts: int = 1) -> None:
ports = [head_port] + worker_ports
else:
# Use port 22 for other clouds
- ports = [22] * self.launched_nodes
+ ports = [22] * self.num_node_ips
self.stable_ssh_ports = ports

def _update_stable_cluster_ips(self, max_attempts: int = 1) -> None:
Expand Down Expand Up @@ -2350,6 +2350,16 @@ def head_ssh_port(self):
return external_ssh_ports[0]
return None

@property
def num_node_ips(self) -> int:
    """Return the number of IP addresses belonging to the cluster.

    For a TPU VM Pod the count comes from
    ``tpu_utils.get_num_tpu_devices(self.launched_resources)``; for every
    other cluster it is ``self.launched_nodes``.
    """
    # A TPU VM Pod's IP count is not given by launched_nodes, so ask
    # tpu_utils for the device count instead.
    if tpu_utils.is_tpu_vm_pod(self.launched_resources):
        return tpu_utils.get_num_tpu_devices(self.launched_resources)
    return self.launched_nodes

def __setstate__(self, state):
self._version = self._VERSION

Expand Down
5 changes: 2 additions & 3 deletions sky/utils/tpu_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@ def is_tpu_vm_pod(resources: Optional[resources_lib.Resources]) -> bool:
return acc not in ['tpu-v2-8', 'tpu-v3-8', 'tpu-v4-8']


- def get_num_tpu_devices(
-         resources: Optional[resources_lib.Resources]) -> Optional[int]:
+ def get_num_tpu_devices(resources: Optional[resources_lib.Resources]) -> int:
if resources is None or not is_tpu(resources):
-     return None
+     raise ValueError('resources must be a valid TPU resource.')
acc, _ = list(resources.accelerators.items())[0]
num_tpu_devices = int(int(acc.split('-')[2]) / 8)
return num_tpu_devices
Expand Down

0 comments on commit e9be60a

Please sign in to comment.