Skip to content

Commit

Permalink
[GCP] Optimize gcp optimize speed with image_id specified (#2410)
Browse files Browse the repository at this point in the history
* Optimize the GCP optimization speed with image_id specified

* Add time test

* format

* Limit to 5s
  • Loading branch information
Michaelvll committed Aug 16, 2023
1 parent e58f33f commit b652b1f
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 8 deletions.
3 changes: 2 additions & 1 deletion sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ def _get_image_id(
f'No image found for region {region_name}')
return image_id_str

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
if image_id.startswith('skypilot:'):
return DEFAULT_AMI_GB
assert region is not None, (image_id, region)
Expand Down
3 changes: 2 additions & 1 deletion sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,8 @@ def get_credential_file_mounts(self) -> Dict[str, str]:
"""
raise NotImplementedError

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
"""Check the image size from the cloud.
Returns: the image size in GB.
Expand Down
16 changes: 12 additions & 4 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,16 +347,17 @@ def _is_machine_image(cls, image_id: str) -> bool:
find_machine = re.match(r'projects/.*/.*/machineImages/.*', image_id)
return find_machine is not None

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
del region # Unused.
@classmethod
@functools.lru_cache(maxsize=1)
def _get_image_size(cls, image_id: str) -> float:
if image_id.startswith('skypilot:'):
return DEFAULT_GCP_IMAGE_GB
try:
compute = gcp.build('compute',
'v1',
credentials=None,
cache_discovery=False)
except gcp.credential_error_exception() as e:
except gcp.credential_error_exception():
return DEFAULT_GCP_IMAGE_GB
try:
image_attrs = image_id.split('/')
Expand All @@ -368,7 +369,7 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float:
# of which are specified with the image_id field. We will
# distinguish them by checking if the image_id contains
# 'machineImages'.
if self._is_machine_image(image_id):
if cls._is_machine_image(image_id):
image_infos = compute.machineImages().get(
project=project, machineImage=image_name).execute()
# The VM launching in a different region than the machine
Expand All @@ -377,8 +378,10 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float:
return float(
image_infos['instanceProperties']['disks'][0]['diskSizeGb'])
else:
start = time.time()
image_infos = compute.images().get(project=project,
image=image_name).execute()
logger.debug(f'GCP image get took {time.time() - start:.2f}s')
return float(image_infos['diskSizeGb'])
except gcp.http_error_exception() as e:
if e.resp.status == 403:
Expand All @@ -391,6 +394,11 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float:
'GCP.') from None
raise

@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
del region # Unused.
return cls._get_image_size(image_id)

@classmethod
def get_default_instance_type(
cls,
Expand Down
3 changes: 2 additions & 1 deletion sky/clouds/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,8 @@ def _get_image_objects():
and img['operating_system']['architecture'].startswith(
'amd')))['id']

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
assert region is not None, (image_id, region)
client = ibm.client(region=region)
try:
Expand Down
3 changes: 2 additions & 1 deletion sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,8 @@ def accelerator_in_region_or_zone(self,
return service_catalog.accelerator_in_region_or_zone(
accelerator, acc_count, region, zone, 'oci')

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
# We ignore checking the image size because most of situations the
# boot volume size is larger than the image size. For specific rare
# situations, the configuration/setup commands should make sure the
Expand Down
27 changes: 27 additions & 0 deletions tests/test_optimizer_dryruns.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import tempfile
import textwrap
import time
from typing import Callable, List, Optional

import pytest
Expand Down Expand Up @@ -653,3 +654,29 @@ def test_invalid_accelerators_regions(enable_all_clouds, monkeypatch):
with pytest.raises(exceptions.ResourcesUnavailableError) as e:
sky.launch(task, cluster_name='should-fail', dryrun=True)
assert 'No launchable resource found for' in str(e.value), str(e.value)


def _test_optimize_speed(resources: sky.Resources):
with sky.Dag() as dag:
task = sky.Task(run='echo hi')
task.set_resources(resources)
start = time.time()
sky.optimize(dag)
end = time.time()
assert end - start < 5.0, (f'optimize took too long for {resources}, '
f'{end - start} seconds')


def test_optimize_speed(enable_all_clouds, monkeypatch):
_test_optimize_speed(sky.Resources(cpus=4))
for cloud in clouds.CLOUD_REGISTRY.values():
if cloud.is_same_cloud(sky.Local()):
continue
_test_optimize_speed(sky.Resources(cloud, cpus='4+'))
_test_optimize_speed(sky.Resources(cpus='4+', memory='4+'))
_test_optimize_speed(
sky.Resources(cpus='4+', memory='4+', accelerators='V100:1'))
_test_optimize_speed(
sky.Resources(cpus='4+', memory='4+', accelerators='A100-80GB:8'))
_test_optimize_speed(
sky.Resources(cpus='4+', memory='4+', accelerators='tpu-v3-32'))

0 comments on commit b652b1f

Please sign in to comment.