Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Azure] Support fractional A10 instance types #3877

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3411,6 +3411,20 @@ def _execute(
valid_resource = self.check_resources_fit_cluster(handle,
task,
check_ports=True)
# For fractional acc count clusters, we round up the number of accs to 1
# (see sky/utils/resources_utils.py::make_ray_custom_resources_str).
# Here we scale the required acc count to (required / launched) * 1 so
# the total number of accs is the same as the requested number.
launched_accs = handle.launched_resources.accelerators
if (launched_accs is not None and
valid_resource.accelerators is not None):
for _, count in launched_accs.items():
if isinstance(count, float) and not count.is_integer():
valid_resource = valid_resource.copy(
accelerators={
k: v / count
for k, v in valid_resource.accelerators.items()
})
task_copy = copy.copy(task)
# Handle multiple resources exec case.
task_copy.set_resources(valid_resource)
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
import enum
import fnmatch
import functools
import json
import os
import re
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -366,7 +365,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='aws')

Expand Down Expand Up @@ -394,10 +393,8 @@ def make_deploy_resources_variables(
r = resources
# r.accelerators is cleared but .instance_type encodes the info.
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

if r.extract_docker_image() is not None:
image_id_to_use = None
Expand Down
10 changes: 4 additions & 6 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Azure."""
import functools
import json
import os
import re
import subprocess
import textwrap
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -253,7 +252,7 @@ def zones_provision_loop(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the clouds that will not return float, we can avoid using Union, as mypy should support an inherited method that only uses a subset of the type.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried and this is what I got:

./format.sh
SkyPilot Black:
All done! ✨ 🍰 ✨
4 files left unchanged.
SkyPilot yapf: Done
SkyPilot isort:
Skipped 5 files
SkyPilot mypy:
sky/clouds/aws.py:365: error: Return type "Optional[Dict[str, int]]" of "get_accelerators_from_instance_type" incompatible with return type "Optional[Dict[str, Union[int, float]]]" in supertype "Cloud"  [override]
sky/clouds/aws.py:369: error: Incompatible return value type (got "Optional[Dict[str, Union[int, float]]]", expected "Optional[Dict[str, int]]")  [return-value]
sky/clouds/aws.py:397: error: Argument 1 to "make_ray_custom_resources_str" has incompatible type "Optional[Dict[str, int]]"; expected "Optional[Dict[str, Union[int, float]]]"  [arg-type]
Found 3 errors in 1 file (checked 225 source files)

return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='azure')

Expand Down Expand Up @@ -285,10 +284,9 @@ def make_deploy_resources_variables(
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
acc_count = None
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
acc_count = str(sum(acc_dict.values()))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

if (resources.image_id is None or
resources.extract_docker_image() is not None):
Expand Down
18 changes: 11 additions & 7 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
"""
import collections
import enum
import math
import typing
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

from sky import exceptions
from sky import skypilot_config
Expand Down Expand Up @@ -306,7 +307,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
raise NotImplementedError

Expand Down Expand Up @@ -644,8 +645,9 @@ def _check_instance_type_accelerators_combination(
assert resources.is_launchable(), resources

def _equal_accelerators(
acc_requested: Optional[Dict[str, int]],
acc_from_instance_type: Optional[Dict[str, int]]) -> bool:
acc_requested: Optional[Dict[str, Union[int, float]]],
acc_from_instance_type: Optional[Dict[str, Union[int,
float]]]) -> bool:
"""Check the requested accelerators equals to the instance type

Check the requested accelerators equals to the accelerators
Expand All @@ -660,12 +662,14 @@ def _equal_accelerators(
for acc in acc_requested:
if acc not in acc_from_instance_type:
return False
if acc_requested[acc] != acc_from_instance_type[acc]:
# Avoid floating-point precision issues.
if not math.isclose(acc_requested[acc],
acc_from_instance_type[acc]):
return False
return True

acc_from_instance_type = (cls.get_accelerators_from_instance_type(
resources.instance_type))
acc_from_instance_type = cls.get_accelerators_from_instance_type(
resources.instance_type)
if not _equal_accelerators(resources.accelerators,
acc_from_instance_type):
with ux_utils.print_exception_no_traceback():
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Cudo Compute"""
import json
import subprocess
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky.clouds import service_catalog
Expand Down Expand Up @@ -183,7 +182,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='cudo')

Expand All @@ -202,10 +201,8 @@ def make_deploy_resources_variables(
del zones, cluster_name # unused
r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Fluidstack Cloud."""
import json
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -155,7 +154,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='fluidstack')

Expand Down Expand Up @@ -184,10 +183,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
4 changes: 2 additions & 2 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

import colorama

Expand Down Expand Up @@ -630,7 +630,7 @@ def _get_feasible_launchable_resources(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
# GCP handles accelerators separately from regular instance types,
# hence return none here.
return None
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""IBM Web Services."""
import json
import os
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -206,10 +205,8 @@ def _get_profile_resources(instance_profile):
'IBM does not currently support spot instances in this framework'

acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

instance_resources = _get_profile_resources(r.instance_type)

Expand Down Expand Up @@ -247,7 +244,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='ibm')
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""Kubernetes."""
import json
import os
import re
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import sky_logging
Expand Down Expand Up @@ -180,7 +179,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
inst = kubernetes_utils.KubernetesInstanceType.from_instance_type(
instance_type)
return {
Expand Down Expand Up @@ -234,10 +233,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

# resources.memory and cpus are None if they are not explicitly set.
# We fetch the default values for the instance type in that case.
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Lambda Cloud."""
import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -137,7 +136,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='lambda')

Expand Down Expand Up @@ -165,10 +164,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
- Hysun He (hysun.he@oracle.com) @ May 4, 2023: Support use the default
image_id (configurable) if no image_id specified in the task yaml.
"""
import json
import logging
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -178,7 +177,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='oci')

Expand All @@ -198,10 +197,8 @@ def make_deploy_resources_variables(

acc_dict = self.get_accelerators_from_instance_type(
resources.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

image_str = self._get_image_id(resources.image_id, region.name,
resources.instance_type)
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/paperspace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
""" Paperspace Cloud. """

import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -162,7 +161,7 @@ def get_default_instance_type(

@classmethod
def get_accelerators_from_instance_type(
cls, instance_type: str) -> Optional[Dict[str, int]]:
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='paperspace')

Expand All @@ -181,10 +180,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
Loading
Loading