diff --git a/sky/authentication.py b/sky/authentication.py
index 4a37cbd2373..0a780894d54 100644
--- a/sky/authentication.py
+++ b/sky/authentication.py
@@ -43,9 +43,9 @@
 from sky.adaptors import ibm
 from sky.adaptors import kubernetes
 from sky.adaptors import runpod
-from sky.clouds.utils import lambda_utils
 from sky.provision.fluidstack import fluidstack_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.lambda_cloud import lambda_utils
 from sky.utils import common_utils
 from sky.utils import kubernetes_enums
 from sky.utils import subprocess_utils
diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py
index d3d20fbd41a..d2573ebbb29 100644
--- a/sky/clouds/lambda_cloud.py
+++ b/sky/clouds/lambda_cloud.py
@@ -8,7 +8,7 @@
 from sky import clouds
 from sky import status_lib
 from sky.clouds import service_catalog
-from sky.clouds.utils import lambda_utils
+from sky.provision.lambda_cloud import lambda_utils
 from sky.utils import resources_utils
 
 if typing.TYPE_CHECKING:
@@ -48,6 +48,9 @@ class Lambda(clouds.Cloud):
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
             f'Host controllers are not supported in {_REPR}.',
     }
 
+    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+    STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+
     @classmethod
     def _unsupported_features_for_resources(
             cls, resources: 'resources_lib.Resources'
diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py
index 41d985ade41..bbe92b68c3a 100644
--- a/sky/provision/__init__.py
+++ b/sky/provision/__init__.py
@@ -19,6 +19,7 @@
 from sky.provision import fluidstack
 from sky.provision import gcp
 from sky.provision import kubernetes
+from sky.provision import lambda_cloud
 from sky.provision import runpod
 from sky.provision import vsphere
 from sky.utils import command_runner
@@ -39,6 +40,8 @@ def _wrapper(*args, **kwargs):
         provider_name = kwargs.pop('provider_name')
         module_name = provider_name.lower()
+        if module_name == 'lambda':
+            module_name = 'lambda_cloud'
         module = globals().get(module_name)
         assert module is not None, f'Unknown provider: {module_name}'
diff --git a/sky/provision/lambda_cloud/__init__.py b/sky/provision/lambda_cloud/__init__.py
new file mode 100644
index 00000000000..63cf634ef09
--- /dev/null
+++ b/sky/provision/lambda_cloud/__init__.py
@@ -0,0 +1,12 @@
+"""Lambda provisioner for SkyPilot."""
+
+from sky.provision.lambda_cloud.config import bootstrap_instances
+from sky.provision.lambda_cloud.instance import cleanup_ports
+from sky.provision.lambda_cloud.instance import get_cluster_info
+from sky.provision.lambda_cloud.instance import open_ports
+from sky.provision.lambda_cloud.instance import query_instances
+from sky.provision.lambda_cloud.instance import query_ports
+from sky.provision.lambda_cloud.instance import run_instances
+from sky.provision.lambda_cloud.instance import stop_instances
+from sky.provision.lambda_cloud.instance import terminate_instances
+from sky.provision.lambda_cloud.instance import wait_instances
diff --git a/sky/provision/lambda_cloud/config.py b/sky/provision/lambda_cloud/config.py
new file mode 100644
index 00000000000..3066e7747fd
--- /dev/null
+++ b/sky/provision/lambda_cloud/config.py
@@ -0,0 +1,10 @@
+"""Lambda Cloud configuration bootstrapping"""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    del region, cluster_name  # unused
+    return config
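For readers following the dispatch change in `sky/provision/__init__.py` above, here is a minimal, self-contained sketch of the routing convention it relies on: the provider name is lowercased and resolved as a module-level name, with `'lambda'` remapped to `'lambda_cloud'` because `lambda` is a Python keyword. This is an illustration only, not the PR's actual wrapper; the helper name `_route_to_provider` and the `getattr` dispatch are assumptions, and running it requires a SkyPilot checkout that includes this PR.

```python
# Illustrative sketch only -- not the wrapper used by SkyPilot. It mirrors the
# hunk above: pop `provider_name`, lower-case it, remap 'lambda' to
# 'lambda_cloud', then resolve the provisioner module from globals().
import functools
from typing import Callable

from sky.provision import lambda_cloud  # new package added by this PR


def _route_to_provider(func: Callable) -> Callable:  # hypothetical helper

    @functools.wraps(func)
    def _wrapper(*args, **kwargs):
        provider_name = kwargs.pop('provider_name')
        module_name = provider_name.lower()
        if module_name == 'lambda':
            # 'lambda' is a Python keyword, so the package is lambda_cloud.
            module_name = 'lambda_cloud'
        module = globals().get(module_name)
        assert module is not None, f'Unknown provider: {module_name}'
        # Assumed dispatch: call the same-named function on the module.
        return getattr(module, func.__name__)(*args, **kwargs)

    return _wrapper


@_route_to_provider
def query_instances(cluster_name_on_cloud, provider_config=None,
                    non_terminated_only=True):
    """Dispatches to sky.provision.<module>.query_instances."""


# Example call (the wrapper consumes provider_name before dispatching):
# query_instances('my-cluster', provider_name='lambda',
#                 provider_config={'region': 'us-east-1'})
```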
diff --git a/sky/provision/lambda_cloud/instance.py b/sky/provision/lambda_cloud/instance.py
new file mode 100644
index 00000000000..d657acc738b
--- /dev/null
+++ b/sky/provision/lambda_cloud/instance.py
@@ -0,0 +1,277 @@
+"""Lambda instance provisioning."""
+
+import time
+from typing import Any, Dict, List, Optional
+
+from sky import authentication as auth
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+import sky.provision.lambda_cloud.lambda_utils as lambda_utils
+from sky.utils import common_utils
+from sky.utils import ux_utils
+
+POLL_INTERVAL = 1
+
+logger = sky_logging.init_logger(__name__)
+_lambda_client = None
+
+
+def _get_lambda_client():
+    global _lambda_client
+    if _lambda_client is None:
+        _lambda_client = lambda_utils.LambdaCloudClient()
+    return _lambda_client
+
+
+def _filter_instances(cluster_name_on_cloud: str,
+                      status_filters: Optional[List[str]]) -> Dict[str, Any]:
+    lambda_client = _get_lambda_client()
+    instances = lambda_client.list_instances()
+    possible_names = [
+        f'{cluster_name_on_cloud}-head',
+        f'{cluster_name_on_cloud}-worker',
+    ]
+
+    filtered_instances = {}
+    for instance in instances:
+        if (status_filters is not None and
+                instance['status'] not in status_filters):
+            continue
+        if instance.get('name') in possible_names:
+            filtered_instances[instance['id']] = instance
+    return filtered_instances
+
+
+def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
+    head_instance_id = None
+    for instance_id, instance in instances.items():
+        if instance['name'].endswith('-head'):
+            head_instance_id = instance_id
+            break
+    return head_instance_id
+
+
+def _get_ssh_key_name(prefix: str = '') -> str:
+    lambda_client = _get_lambda_client()
+    _, public_key_path = auth.get_or_generate_keys()
+    with open(public_key_path, 'r', encoding='utf-8') as f:
+        public_key = f.read()
+    name, exists = lambda_client.get_unique_ssh_key_name(prefix, public_key)
+    if not exists:
+        raise lambda_utils.LambdaCloudError('SSH key not found')
+    return name
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster"""
+    lambda_client = _get_lambda_client()
+    pending_status = ['booting']
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, pending_status)
+        if not instances:
+            break
+        logger.info(f'Waiting for {len(instances)} instances to be ready.')
+        time.sleep(POLL_INTERVAL)
+    exist_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+    head_instance_id = _get_head_instance_id(exist_instances)
+
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance_id is None:
+            raise RuntimeError(
+                f'Cluster {cluster_name_on_cloud} has no head node.')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='lambda',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,
+            head_instance_id=head_instance_id,
+            resumed_instance_ids=[],
+            created_instance_ids=[],
+        )
+
+    created_instance_ids = []
+    ssh_key_name = _get_ssh_key_name()
+
+    def launch_nodes(node_type: str, quantity: int):
+        try:
+            instance_ids = lambda_client.create_instances(
+                instance_type=config.node_config['InstanceType'],
+                region=region,
+                name=f'{cluster_name_on_cloud}-{node_type}',
+                quantity=quantity,
+                ssh_key_name=ssh_key_name,
+            )
+            logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
+                        f'instance_ids: {instance_ids}')
+            return instance_ids
+        except Exception as e:
+            logger.warning(f'run_instances error: {e}')
+            raise
+
+    if head_instance_id is None:
+        instance_ids = launch_nodes('head', 1)
+        if len(instance_ids) != 1:
+            raise RuntimeError(
+                f'Expected exactly one instance, got {len(instance_ids)}')
+        created_instance_ids.append(instance_ids[0])
+        head_instance_id = instance_ids[0]
+
+    assert head_instance_id is not None, 'head_instance_id should not be None'
+
+    worker_node_count = to_start_count - 1
+    if worker_node_count > 0:
+        instance_ids = launch_nodes('worker', worker_node_count)
+        created_instance_ids.extend(instance_ids)
+
+    while True:
+        instances = _filter_instances(cluster_name_on_cloud, ['active'])
+        if len(instances) == config.count:
+            break
+
+        time.sleep(POLL_INTERVAL)
+
+    return common.ProvisionRecord(
+        provider_name='lambda',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=None,
+        head_instance_id=head_instance_id,
+        resumed_instance_ids=[],
+        created_instance_ids=created_instance_ids,
+    )
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state  # Unused.
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    raise NotImplementedError(
+        'stop_instances is not supported for Lambda Cloud')
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config
+    lambda_client = _get_lambda_client()
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    instance_ids_to_terminate = []
+    for instance_id, instance in instances.items():
+        if worker_only and not instance['name'].endswith('-worker'):
+            continue
+        instance_ids_to_terminate.append(instance_id)
+
+    try:
+        logger.debug(
+            f'Terminating instances {", ".join(instance_ids_to_terminate)}')
+        lambda_client.remove_instances(*instance_ids_to_terminate)
+    except Exception as e:  # pylint: disable=broad-except
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to terminate instances {instance_ids_to_terminate}: '
+                f'{common_utils.format_exception(e, use_bracket=False)}') from e
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = _filter_instances(cluster_name_on_cloud, ['active'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance_id = None
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
+            common.InstanceInfo(
+                instance_id=instance_id,
+                internal_ip=instance_info['private_ip'],
+                external_ip=instance_info['ip'],
+                ssh_port=22,
+                tags={},
+            )
+        ]
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id
+
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance_id,
+        provider_name='lambda',
+        provider_config=provider_config,
+        custom_ray_options={
+            'use_external_ip': True,
+        },
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = _filter_instances(cluster_name_on_cloud, None)
+
+    status_map = {
+        'booting': status_lib.ClusterStatus.INIT,
+        'active': status_lib.ClusterStatus.UP,
+        'unhealthy': status_lib.ClusterStatus.INIT,
+        'terminating': status_lib.ClusterStatus.STOPPED,
+        'terminated': status_lib.ClusterStatus.STOPPED,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for instance_id, instance in instances.items():
+        status = status_map.get(instance['status'])
+        if non_terminated_only and status is None:
+            continue
+        statuses[instance_id] = status
+    return statuses
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    raise NotImplementedError()
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del cluster_name_on_cloud, ports, provider_config  # Unused.
+
+
+def query_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    head_ip: Optional[str] = None,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> Dict[int, List[common.Endpoint]]:
+    """See sky/provision/__init__.py"""
+    del cluster_name_on_cloud, provider_config  # Unused.
+    return common.query_ports_passthrough(ports, head_ip)
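As a quick sanity check of the status translation in `query_instances()` above, the following is a hypothetical snippet, not part of the PR: it swaps the module-level client cache for a stub and calls the function directly. The class `_FakeLambdaClient`, the instance records, and the cluster name are invented for illustration.

```python
# Hypothetical usage sketch: exercise query_instances() with a stubbed client.
# Only list_instances() is needed by the filtering and status-mapping code.
from unittest import mock

import sky.provision.lambda_cloud.instance as lambda_instance


class _FakeLambdaClient:

    def list_instances(self):
        return [
            {'id': 'inst-1', 'name': 'demo-head', 'status': 'active'},
            {'id': 'inst-2', 'name': 'demo-worker', 'status': 'booting'},
            {'id': 'inst-3', 'name': 'other-head', 'status': 'active'},
        ]


# Replace the cached module-level client so no real API call is made.
with mock.patch.object(lambda_instance, '_lambda_client', _FakeLambdaClient()):
    statuses = lambda_instance.query_instances(
        'demo', provider_config={'region': 'us-east-1'})

# Expected per the status_map above: inst-1 -> UP, inst-2 -> INIT;
# inst-3 is dropped because its name does not match the cluster.
print(statuses)
```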
diff --git a/sky/clouds/utils/lambda_utils.py b/sky/provision/lambda_cloud/lambda_utils.py
similarity index 92%
rename from sky/clouds/utils/lambda_utils.py
rename to sky/provision/lambda_cloud/lambda_utils.py
index 61c4b33ebe9..99854a45da4 100644
--- a/sky/clouds/utils/lambda_utils.py
+++ b/sky/provision/lambda_cloud/lambda_utils.py
@@ -1,4 +1,5 @@
 """Lambda Cloud helper functions."""
+
 import json
 import os
 import time
@@ -76,7 +77,7 @@ def refresh(self, instance_ids: List[str]) -> None:
 
 
 def raise_lambda_error(response: requests.Response) -> None:
-    """Raise LambdaCloudError if appropriate. """
+    """Raise LambdaCloudError if appropriate."""
     status_code = response.status_code
     if status_code == 200:
         return
@@ -131,20 +132,22 @@ def __init__(self) -> None:
         self.api_key = self._credentials['api_key']
         self.headers = {'Authorization': f'Bearer {self.api_key}'}
 
-    def create_instances(self,
-                         instance_type: str = 'gpu_1x_a100_sxm4',
-                         region: str = 'us-east-1',
-                         quantity: int = 1,
-                         name: str = '',
-                         ssh_key_name: str = '') -> List[str]:
+    def create_instances(
+        self,
+        instance_type: str = 'gpu_1x_a100_sxm4',
+        region: str = 'us-east-1',
+        quantity: int = 1,
+        name: str = '',
+        ssh_key_name: str = '',
+    ) -> List[str]:
         """Launch new instances."""
         # Optimization:
         # Most API requests are rate limited at ~1 request every second but
         # launch requests are rate limited at ~1 request every 10 seconds.
         # So don't use launch requests to check availability.
         # See https://docs.lambdalabs.com/cloud/rate-limiting/ for more.
-        available_regions = self.list_catalog()[instance_type]\
-            ['regions_with_capacity_available']
+        available_regions = (self.list_catalog()[instance_type]
+                             ['regions_with_capacity_available'])
         available_regions = [reg['name'] for reg in available_regions]
         if region not in available_regions:
             if len(available_regions) > 0:
@@ -163,27 +166,25 @@ def create_instances(self,
             'instance_type_name': instance_type,
             'ssh_key_names': [ssh_key_name],
             'quantity': quantity,
-            'name': name
+            'name': name,
         })
         response = _try_request_with_backoff(
             'post',
             f'{API_ENDPOINT}/instance-operations/launch',
             data=data,
-            headers=self.headers)
+            headers=self.headers,
+        )
         return response.json().get('data', []).get('instance_ids', [])
 
     def remove_instances(self, *instance_ids: str) -> Dict[str, Any]:
         """Terminate instances."""
-        data = json.dumps({
-            'instance_ids': [
-                instance_ids[0]  # TODO(ewzeng) don't hardcode
-            ]
-        })
+        data = json.dumps({'instance_ids': list(instance_ids)})
         response = _try_request_with_backoff(
             'post',
             f'{API_ENDPOINT}/instance-operations/terminate',
            data=data,
-            headers=self.headers)
+            headers=self.headers,
+        )
         return response.json().get('data', []).get('terminated_instances', [])
 
     def list_instances(self) -> List[Dict[str, Any]]:
diff --git a/sky/setup_files/MANIFEST.in b/sky/setup_files/MANIFEST.in
index 54ab3b55a32..0cd93f485e0 100644
--- a/sky/setup_files/MANIFEST.in
+++ b/sky/setup_files/MANIFEST.in
@@ -6,7 +6,6 @@ include sky/setup_files/*
 include sky/skylet/*.sh
 include sky/skylet/LICENSE
 include sky/skylet/providers/ibm/*
-include sky/skylet/providers/lambda_cloud/*
 include sky/skylet/providers/oci/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
diff --git a/sky/skylet/providers/lambda_cloud/__init__.py b/sky/skylet/providers/lambda_cloud/__init__.py
deleted file mode 100644
index 64dac295eb5..00000000000
--- a/sky/skylet/providers/lambda_cloud/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-"""Lambda Cloud node provider"""
-from sky.skylet.providers.lambda_cloud.node_provider import LambdaNodeProvider
diff --git a/sky/skylet/providers/lambda_cloud/node_provider.py b/sky/skylet/providers/lambda_cloud/node_provider.py
deleted file mode 100644
index bb8d40da62e..00000000000
--- a/sky/skylet/providers/lambda_cloud/node_provider.py
+++ /dev/null
@@ -1,320 +0,0 @@
-import logging
-import os
-from threading import RLock
-import time
-from typing import Any, Dict, List, Optional
-
-from ray.autoscaler.node_provider import NodeProvider
-from ray.autoscaler.tags import NODE_KIND_HEAD
-from ray.autoscaler.tags import NODE_KIND_WORKER
-from ray.autoscaler.tags import STATUS_UP_TO_DATE
-from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
-from ray.autoscaler.tags import TAG_RAY_NODE_KIND
-from ray.autoscaler.tags import TAG_RAY_NODE_NAME
-from ray.autoscaler.tags import TAG_RAY_NODE_STATUS
-from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE
-
-from sky import authentication as auth
-from sky.clouds.utils import lambda_utils
-from sky.utils import command_runner
-from sky.utils import common_utils
-from sky.utils import subprocess_utils
-from sky.utils import ux_utils
-
-_TAG_PATH_PREFIX = '~/.sky/generated/lambda_cloud/metadata'
-_REMOTE_SSH_KEY_NAME = '~/.lambda_cloud/ssh_key_name'
-_REMOTE_RAY_SSH_KEY = '~/ray_bootstrap_key.pem'
-_REMOTE_RAY_YAML = '~/ray_bootstrap_config.yaml'
-_GET_INTERNAL_IP_CMD = 'ip -4 -br addr show | grep UP | grep -Eo "(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1]))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"'
"(10\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|172\.(1[6-9]|2[0-9]|3[0-1]))\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"' - -logger = logging.getLogger(__name__) - - -def synchronized(f): - - def wrapper(self, *args, **kwargs): - self.lock.acquire() - try: - return f(self, *args, **kwargs) - finally: - self.lock.release() - - return wrapper - - -class LambdaNodeProvider(NodeProvider): - """Node Provider for Lambda Cloud. - - This provider assumes Lambda Cloud credentials are set. - """ - - def __init__(self, provider_config: Dict[str, Any], - cluster_name: str) -> None: - NodeProvider.__init__(self, provider_config, cluster_name) - self.lock = RLock() - self.lambda_client = lambda_utils.LambdaCloudClient() - self.cached_nodes: Dict[str, Dict[str, Any]] = {} - self.metadata = lambda_utils.Metadata(_TAG_PATH_PREFIX, cluster_name) - self.ssh_key_path = os.path.expanduser(auth.PRIVATE_SSH_KEY_PATH) - - def _get_ssh_key_name(prefix: str) -> str: - public_key_path = os.path.expanduser(auth.PUBLIC_SSH_KEY_PATH) - with open(public_key_path, 'r') as f: - public_key = f.read() - name, exists = self.lambda_client.get_unique_ssh_key_name( - prefix, public_key) - if not exists: - raise lambda_utils.LambdaCloudError('SSH key not found') - return name - - ray_yaml_path = os.path.expanduser(_REMOTE_RAY_YAML) - self.on_head = (os.path.exists(ray_yaml_path) and - common_utils.read_yaml(ray_yaml_path)['cluster_name'] - == cluster_name) - - if self.on_head: - self.ssh_key_path = os.path.expanduser(_REMOTE_RAY_SSH_KEY) - ssh_key_name_path = os.path.expanduser(_REMOTE_SSH_KEY_NAME) - if os.path.exists(ssh_key_name_path): - with open(ssh_key_name_path, 'r') as f: - self.ssh_key_name = f.read() - else: - # At this point, `~/.ssh/sky-key.pub` contains the public - # key used to launch this cluster. Use it to determine - # ssh key name and store the name in _REMOTE_SSH_KEY_NAME. - # Note: this case only runs during cluster launch, so it is - # not possible for ~/.ssh/sky-key.pub to already be regenerated - # by the user. 
-                self.ssh_key_name = _get_ssh_key_name('')
-                with open(ssh_key_name_path, 'w', encoding='utf-8') as f:
-                    f.write(self.ssh_key_name)
-        else:
-            # On local
-            self.ssh_key_name = _get_ssh_key_name(
-                f'sky-key-{common_utils.get_user_hash()}')
-
-    def _guess_and_add_missing_tags(self, vms: List[Dict[str, Any]]) -> None:
-        """Adds missing vms to local tag file and guesses their tags."""
-        for node in vms:
-            if self.metadata.get(node['id']) is not None:
-                pass
-            elif node['name'] == f'{self.cluster_name}-head':
-                self.metadata.set(
-                    node['id'], {
-                        'tags': {
-                            TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
-                            TAG_RAY_USER_NODE_TYPE: 'ray_head_default',
-                            TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-head',
-                        }
-                    })
-            elif node['name'] == f'{self.cluster_name}-worker':
-                self.metadata.set(
-                    node['id'], {
-                        'tags': {
-                            TAG_RAY_CLUSTER_NAME: self.cluster_name,
-                            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
-                            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
-                            TAG_RAY_USER_NODE_TYPE: 'ray_worker_default',
-                            TAG_RAY_NODE_NAME: f'ray-{self.cluster_name}-worker',
-                        }
-                    })
-
-    def _list_instances_in_cluster(self) -> List[Dict[str, Any]]:
-        """List running instances in cluster."""
-        vms = self.lambda_client.list_instances()
-        possible_names = [
-            f'{self.cluster_name}-head', f'{self.cluster_name}-worker'
-        ]
-        return [node for node in vms if node.get('name') in possible_names]
-
-    @synchronized
-    def _get_filtered_nodes(self, tag_filters: Dict[str,
-                                                    str]) -> Dict[str, Any]:
-
-        def _extract_metadata(vm: Dict[str, Any]) -> Dict[str, Any]:
-            metadata = {'id': vm['id'], 'status': vm['status'], 'tags': {}}
-            instance_info = self.metadata.get(vm['id'])
-            if instance_info is not None:
-                metadata['tags'] = instance_info['tags']
-            metadata['external_ip'] = vm.get('ip')
-            return metadata
-
-        def _match_tags(vm: Dict[str, Any]):
-            vm_info = self.metadata.get(vm['id'])
-            tags = {} if vm_info is None else vm_info['tags']
-            for k, v in tag_filters.items():
-                if tags.get(k) != v:
-                    return False
-            return True
-
-        def _get_internal_ip(node: Dict[str, Any]):
-            # TODO(ewzeng): cache internal ips in metadata file to reduce
-            # ssh overhead.
-            if node['external_ip'] is None or node['status'] != 'active':
-                node['internal_ip'] = None
-                return
-            runner = command_runner.SSHCommandRunner(
-                node=(node['external_ip'], 22),
-                ssh_user='ubuntu',
-                ssh_private_key=self.ssh_key_path)
-            rc, stdout, stderr = runner.run(_GET_INTERNAL_IP_CMD,
-                                            require_outputs=True,
-                                            stream_logs=False)
-            subprocess_utils.handle_returncode(
-                rc,
-                _GET_INTERNAL_IP_CMD,
-                'Failed get obtain private IP from node',
-                stderr=stdout + stderr)
-            node['internal_ip'] = stdout.strip()
-
-        vms = self._list_instances_in_cluster()
-        self.metadata.refresh([node['id'] for node in vms])
-        self._guess_and_add_missing_tags(vms)
-        nodes = [_extract_metadata(vm) for vm in filter(_match_tags, vms)]
-        nodes = [
-            node for node in nodes
-            if node['status'] not in ['terminating', 'terminated']
-        ]
-        subprocess_utils.run_in_parallel(_get_internal_ip, nodes)
-        self.cached_nodes = {node['id']: node for node in nodes}
-        return self.cached_nodes
-
-    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
-        """Return a list of node ids filtered by the specified tags dict.
-
-        This list must not include terminated nodes. For performance reasons,
-        providers are allowed to cache the result of a call to
-        non_terminated_nodes() to serve single-node queries
-        (e.g. is_running(node_id)). This means that non_terminated_nodes() must
-        be called again to refresh results.
-
-        Examples:
-            >>> provider.non_terminated_nodes({TAG_RAY_NODE_KIND: "worker"})
-            ["node-1", "node-2"]
-        """
-        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
-        return [k for k, _ in nodes.items()]
-
-    def is_running(self, node_id: str) -> bool:
-        """Return whether the specified node is running."""
-        return self._get_cached_node(node_id=node_id) is not None
-
-    def is_terminated(self, node_id: str) -> bool:
-        """Return whether the specified node is terminated."""
-        return self._get_cached_node(node_id=node_id) is None
-
-    def node_tags(self, node_id: str) -> Dict[str, str]:
-        """Returns the tags of the given node (string dict)."""
-        node = self._get_cached_node(node_id=node_id)
-        if node is None:
-            return {}
-        return node['tags']
-
-    def external_ip(self, node_id: str) -> Optional[str]:
-        """Returns the external ip of the given node."""
-        node = self._get_cached_node(node_id=node_id)
-        if node is None:
-            return None
-        ip = node.get('external_ip')
-        with ux_utils.print_exception_no_traceback():
-            if ip is None:
-                raise lambda_utils.LambdaCloudError(
-                    'A node ip address was not found. Either '
-                    '(1) Lambda Cloud has internally errored, or '
-                    '(2) the cluster is still booting. '
-                    'You can manually terminate the cluster on the '
-                    'Lambda Cloud console or (in case 2) wait for '
-                    'booting to finish (~2 minutes).')
-        return ip
-
-    def internal_ip(self, node_id: str) -> Optional[str]:
-        """Returns the internal ip (Ray ip) of the given node."""
-        node = self._get_cached_node(node_id=node_id)
-        if node is None:
-            return None
-        ip = node.get('internal_ip')
-        with ux_utils.print_exception_no_traceback():
-            if ip is None:
-                raise lambda_utils.LambdaCloudError(
-                    'A node ip address was not found. Either '
-                    '(1) Lambda Cloud has internally errored, or '
-                    '(2) the cluster is still booting. '
-                    'You can manually terminate the cluster on the '
-                    'Lambda Cloud console or (in case 2) wait for '
-                    'booting to finish (~2 minutes).')
-        return ip
-
-    def create_node(self, node_config: Dict[str, Any], tags: Dict[str, str],
-                    count: int) -> None:
-        """Creates a number of nodes within the namespace."""
-        # Get tags
-        config_tags = node_config.get('tags', {}).copy()
-        config_tags.update(tags)
-        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
-
-        # Create nodes
-        instance_type = node_config['InstanceType']
-        region = self.provider_config['region']
-
-        if config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD:
-            name = f'{self.cluster_name}-head'
-            # Occasionally, the head node will continue running for a short
-            # period after termination. This can lead to the following bug:
-            # 1. Head node autodowns but continues running.
-            # 2. The next autodown event is triggered, which executes ray up.
-            # 3. Head node stops running.
-            # In this case, a new head node is created after the cluster has
-            # terminated. We avoid this with the following check:
-            if self.on_head:
-                raise lambda_utils.LambdaCloudError('Head already exists.')
-        else:
-            name = f'{self.cluster_name}-worker'
-
-        # Lambda launch api only supports launching one node at a time,
-        # so we do a loop. Remove loop when launch api allows quantity > 1
-        booting_list = []
-        for _ in range(count):
-            vm_id = self.lambda_client.create_instances(
-                instance_type=instance_type,
-                region=region,
-                quantity=1,
-                name=name,
-                ssh_key_name=self.ssh_key_name)[0]
-            self.metadata.set(vm_id, {'tags': config_tags})
-            booting_list.append(vm_id)
-            time.sleep(10)  # Avoid api rate limits
-
-        # Wait for nodes to finish booting
-        while True:
-            vms = self._list_instances_in_cluster()
-            for vm_id in booting_list.copy():
-                for vm in vms:
-                    if vm['id'] == vm_id and vm['status'] == 'active':
-                        booting_list.remove(vm_id)
-            if len(booting_list) == 0:
-                return
-            time.sleep(10)
-
-    @synchronized
-    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
-        """Sets the tag values (string dict) for the specified node."""
-        node = self._get_node(node_id)
-        assert node is not None, node_id
-        node['tags'].update(tags)
-        self.metadata.set(node_id, {'tags': node['tags']})
-
-    def terminate_node(self, node_id: str) -> None:
-        """Terminates the specified node."""
-        self.lambda_client.remove_instances(node_id)
-        self.metadata.set(node_id, None)
-
-    def _get_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-        self._get_filtered_nodes({})  # Side effect: updates cache
-        return self.cached_nodes.get(node_id, None)
-
-    def _get_cached_node(self, node_id: str) -> Optional[Dict[str, Any]]:
-        if node_id in self.cached_nodes:
-            return self.cached_nodes[node_id]
-        return self._get_node(node_id=node_id)
diff --git a/sky/templates/lambda-ray.yml.j2 b/sky/templates/lambda-ray.yml.j2
index 6b6d94cfb3c..c4b8dba1a9f 100644
--- a/sky/templates/lambda-ray.yml.j2
+++ b/sky/templates/lambda-ray.yml.j2
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
 
 provider:
   type: external
-  module: sky.skylet.providers.lambda_cloud.LambdaNodeProvider
+  module: sky.provision.lambda
   region: {{region}}
   # Disable launch config check for worker nodes as it can cause resource
   # leakage.
@@ -25,14 +25,6 @@ available_node_types:
     resources: {}
     node_config:
       InstanceType: {{instance_type}}
-{% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      InstanceType: {{instance_type}}
-{%- endif %}
 
 head_node_type: ray_head_default
 
@@ -64,7 +56,10 @@ setup_commands:
   # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - sudo systemctl stop unattended-upgrades || true;
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    sudo systemctl stop unattended-upgrades || true;
     sudo systemctl disable unattended-upgrades || true;
     sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
     sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
@@ -81,31 +76,5 @@ setup_commands:
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
 
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 2
-head_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --min-worker-port 11002 --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --min-worker-port 11002 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
-
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list them here anymore.
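To see what the new `initial_setup_commands` loop in `lambda-ray.yml.j2` expands to, here is a standalone rendering sketch; it is not part of the PR, the example command is invented, and in practice SkyPilot supplies `initial_setup_commands` when it fills in the template.

```python
# Standalone sketch: render a fragment equivalent to the loop added to
# sky/templates/lambda-ray.yml.j2. The command list passed to render() is
# made up purely for illustration.
import jinja2

FRAGMENT = """\
setup_commands:
  - {%- for initial_setup_command in initial_setup_commands %}
    {{ initial_setup_command }}
    {%- endfor %}
    sudo systemctl stop unattended-upgrades || true;
"""

print(jinja2.Template(FRAGMENT).render(
    initial_setup_commands=['echo "extra setup step";']))
# The injected commands are emitted ahead of the existing unattended-upgrades
# line, inside the same YAML list item.
```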