diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 223d257d1a6..c07938f1b00 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -223,7 +223,8 @@ def _get_image_id( f'No image found for region {region_name}') return image_id_str - def get_image_size(self, image_id: str, region: Optional[str]) -> float: + @classmethod + def get_image_size(cls, image_id: str, region: Optional[str]) -> float: if image_id.startswith('skypilot:'): return DEFAULT_AMI_GB assert region is not None, (image_id, region) diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index f5294f48236..46f440b63a7 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -386,7 +386,8 @@ def get_credential_file_mounts(self) -> Dict[str, str]: """ raise NotImplementedError - def get_image_size(self, image_id: str, region: Optional[str]) -> float: + @classmethod + def get_image_size(cls, image_id: str, region: Optional[str]) -> float: """Check the image size from the cloud. Returns: the image size in GB. diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index a4f0267d3cf..e81c9793107 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -347,8 +347,9 @@ def _is_machine_image(cls, image_id: str) -> bool: find_machine = re.match(r'projects/.*/.*/machineImages/.*', image_id) return find_machine is not None - def get_image_size(self, image_id: str, region: Optional[str]) -> float: - del region # Unused. + @classmethod + @functools.lru_cache(maxsize=1) + def _get_image_size(cls, image_id: str) -> float: if image_id.startswith('skypilot:'): return DEFAULT_GCP_IMAGE_GB try: @@ -356,7 +357,7 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: 'v1', credentials=None, cache_discovery=False) - except gcp.credential_error_exception() as e: + except gcp.credential_error_exception(): return DEFAULT_GCP_IMAGE_GB try: image_attrs = image_id.split('/') @@ -368,7 +369,7 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: # of which are specified with the image_id field. We will # distinguish them by checking if the image_id contains # 'machineImages'. - if self._is_machine_image(image_id): + if cls._is_machine_image(image_id): image_infos = compute.machineImages().get( project=project, machineImage=image_name).execute() # The VM launching in a different region than the machine @@ -377,8 +378,10 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: return float( image_infos['instanceProperties']['disks'][0]['diskSizeGb']) else: + start = time.time() image_infos = compute.images().get(project=project, image=image_name).execute() + logger.debug(f'GCP image get took {time.time() - start:.2f}s') return float(image_infos['diskSizeGb']) except gcp.http_error_exception() as e: if e.resp.status == 403: @@ -391,6 +394,11 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: 'GCP.') from None raise + @classmethod + def get_image_size(cls, image_id: str, region: Optional[str]) -> float: + del region # Unused. + return cls._get_image_size(image_id) + @classmethod def get_default_instance_type( cls, diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index 6234b757b59..6809c48f303 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -350,7 +350,8 @@ def _get_image_objects(): and img['operating_system']['architecture'].startswith( 'amd')))['id'] - def get_image_size(self, image_id: str, region: Optional[str]) -> float: + @classmethod + def get_image_size(cls, image_id: str, region: Optional[str]) -> float: assert region is not None, (image_id, region) client = ibm.client(region=region) try: diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index c5aa2c07898..3435c3ee80b 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -442,7 +442,8 @@ def accelerator_in_region_or_zone(self, return service_catalog.accelerator_in_region_or_zone( accelerator, acc_count, region, zone, 'oci') - def get_image_size(self, image_id: str, region: Optional[str]) -> float: + @classmethod + def get_image_size(cls, image_id: str, region: Optional[str]) -> float: # We ignore checking the image size because most of situations the # boot volume size is larger than the image size. For specific rare # situations, the configuration/setup commands should make sure the diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index a673ac5694e..1d3a97c0385 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -1,5 +1,6 @@ import tempfile import textwrap +import time from typing import Callable, List, Optional import pytest @@ -653,3 +654,29 @@ def test_invalid_accelerators_regions(enable_all_clouds, monkeypatch): with pytest.raises(exceptions.ResourcesUnavailableError) as e: sky.launch(task, cluster_name='should-fail', dryrun=True) assert 'No launchable resource found for' in str(e.value), str(e.value) + + +def _test_optimize_speed(resources: sky.Resources): + with sky.Dag() as dag: + task = sky.Task(run='echo hi') + task.set_resources(resources) + start = time.time() + sky.optimize(dag) + end = time.time() + assert end - start < 5.0, (f'optimize took too long for {resources}, ' + f'{end - start} seconds') + + +def test_optimize_speed(enable_all_clouds, monkeypatch): + _test_optimize_speed(sky.Resources(cpus=4)) + for cloud in clouds.CLOUD_REGISTRY.values(): + if cloud.is_same_cloud(sky.Local()): + continue + _test_optimize_speed(sky.Resources(cloud, cpus='4+')) + _test_optimize_speed(sky.Resources(cpus='4+', memory='4+')) + _test_optimize_speed( + sky.Resources(cpus='4+', memory='4+', accelerators='V100:1')) + _test_optimize_speed( + sky.Resources(cpus='4+', memory='4+', accelerators='A100-80GB:8')) + _test_optimize_speed( + sky.Resources(cpus='4+', memory='4+', accelerators='tpu-v3-32'))