Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Azure] Support fractional A10 instance types #3877

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2663,6 +2663,24 @@ def check_resources_fit_cluster(
'stores, but the existing cluster with '
f'{launched_resources!r} does not support FUSE '
f'mounting. Launch a new cluster to run this task.')
if (example_resource.accelerators is not None and
launched_resources.accelerators is not None):
for acc in example_resource.accelerators:
if acc not in launched_resources.accelerators:
continue
self_count = example_resource.accelerators[acc]
existing_count = launched_resources.accelerators[acc]
if (isinstance(existing_count, float) and
not existing_count.is_integer() and
not math.isclose(self_count, existing_count)):
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesMismatchError(
'Task requested resources with fractional '
'accelerator counts. For fractional '
'counts, the required count must match the '
'existing cluster. Got required accelerator'
f' {acc}:{self_count} but the existing '
f'cluster has {acc}:{existing_count}.')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this error message accurate? Our check is on the accelerator count of the existing cluster, not on the resources requested by the task.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please see the above comments 🤔

requested_resource_str = ', '.join(requested_resource_list)
if isinstance(task.resources, list):
requested_resource_str = f'[{requested_resource_str}]'
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
import enum
import fnmatch
import functools
import json
import os
import re
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -366,7 +365,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='aws')

Expand Down Expand Up @@ -394,10 +393,8 @@ def make_deploy_resources_variables(
r = resources
# r.accelerators is cleared but .instance_type encodes the info.
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

if r.extract_docker_image() is not None:
image_id_to_use = None
Expand Down
8 changes: 4 additions & 4 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Azure."""
import functools
import json
import os
import re
import subprocess
import textwrap
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -253,7 +252,7 @@ def zones_provision_loop(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the clouds that will never return a float, we can avoid using Union, as mypy should accept an overriding method whose return type uses only a subset of the parent's type.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried and this is what I got:

./format.sh
SkyPilot Black:
All done! ✨ 🍰 ✨
4 files left unchanged.
SkyPilot yapf: Done
SkyPilot isort:
Skipped 5 files
SkyPilot mypy:
sky/clouds/aws.py:365: error: Return type "Optional[Dict[str, int]]" of "get_accelerators_from_instance_type" incompatible with return type "Optional[Dict[str, Union[int, float]]]" in supertype "Cloud"  [override]
sky/clouds/aws.py:369: error: Incompatible return value type (got "Optional[Dict[str, Union[int, float]]]", expected "Optional[Dict[str, int]]")  [return-value]
sky/clouds/aws.py:397: error: Argument 1 to "make_ray_custom_resources_str" has incompatible type "Optional[Dict[str, int]]"; expected "Optional[Dict[str, Union[int, float]]]"  [arg-type]
Found 3 errors in 1 file (checked 225 source files)

return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='azure')

Expand Down Expand Up @@ -285,7 +284,8 @@ def make_deploy_resources_variables(
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
acc_count = None
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)
acc_count = str(sum(acc_dict.values()))
else:
custom_resources = None
Expand Down
18 changes: 11 additions & 7 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
"""
import collections
import enum
import math
import typing
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

from sky import exceptions
from sky import skypilot_config
Expand Down Expand Up @@ -306,7 +307,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
raise NotImplementedError

Expand Down Expand Up @@ -644,8 +645,9 @@ def _check_instance_type_accelerators_combination(
assert resources.is_launchable(), resources

def _equal_accelerators(
acc_requested: Optional[Dict[str, int]],
acc_from_instance_type: Optional[Dict[str, int]]) -> bool:
acc_requested: Optional[Dict[str, Union[int, float]]],
acc_from_instance_type: Optional[Dict[str, Union[int,
float]]]) -> bool:
"""Check the requested accelerators equals to the instance type

Check the requested accelerators equals to the accelerators
Expand All @@ -660,12 +662,14 @@ def _equal_accelerators(
for acc in acc_requested:
if acc not in acc_from_instance_type:
return False
if acc_requested[acc] != acc_from_instance_type[acc]:
# Avoid float point precision issue.
if not math.isclose(acc_requested[acc],
acc_from_instance_type[acc]):
return False
return True

acc_from_instance_type = (cls.get_accelerators_from_instance_type(
resources.instance_type))
acc_from_instance_type = cls.get_accelerators_from_instance_type(
resources.instance_type)
if not _equal_accelerators(resources.accelerators,
acc_from_instance_type):
with ux_utils.print_exception_no_traceback():
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Cudo Compute"""
import json
import subprocess
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky.clouds import service_catalog
Expand Down Expand Up @@ -183,7 +182,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='cudo')

Expand All @@ -202,10 +201,8 @@ def make_deploy_resources_variables(
del zones, cluster_name # unused
r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Fluidstack Cloud."""
import json
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -155,7 +154,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='fluidstack')

Expand Down Expand Up @@ -184,10 +183,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
4 changes: 2 additions & 2 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

import colorama

Expand Down Expand Up @@ -630,7 +630,7 @@ def _get_feasible_launchable_resources(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
# GCP handles accelerators separately from regular instance types,
# hence return none here.
return None
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""IBM Web Services."""
import json
import os
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -206,10 +205,8 @@ def _get_profile_resources(instance_profile):
'IBM does not currently support spot instances in this framework'

acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

instance_resources = _get_profile_resources(r.instance_type)

Expand Down Expand Up @@ -247,7 +244,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='ibm')
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""Kubernetes."""
import json
import os
import re
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import sky_logging
Expand Down Expand Up @@ -180,7 +179,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
inst = kubernetes_utils.KubernetesInstanceType.from_instance_type(
instance_type)
return {
Expand Down Expand Up @@ -234,10 +233,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

# resources.memory and cpus are None if they are not explicitly set.
# We fetch the default values for the instance type in that case.
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Lambda Cloud."""
import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -137,7 +136,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='lambda')

Expand Down Expand Up @@ -165,10 +164,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
- Hysun He (hysun.he@oracle.com) @ May 4, 2023: Support use the default
image_id (configurable) if no image_id specified in the task yaml.
"""
import json
import logging
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -178,7 +177,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='oci')

Expand All @@ -198,10 +197,8 @@ def make_deploy_resources_variables(

acc_dict = self.get_accelerators_from_instance_type(
resources.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

image_str = self._get_image_id(resources.image_id, region.name,
resources.instance_type)
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/paperspace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
""" Paperspace Cloud. """

import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -162,7 +161,7 @@ def get_default_instance_type(

@classmethod
def get_accelerators_from_instance_type(
cls, instance_type: str) -> Optional[Dict[str, int]]:
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='paperspace')

Expand All @@ -181,10 +180,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
Loading
Loading