diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 1768cd6091e..8e2b2340271 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -58,12 +58,11 @@ class Azure(clouds.Cloud): # names, so the limit is 64 - 4 - 7 - 10 = 43. # Reference: https://azure.github.io/PSRule.Rules.Azure/en/rules/Azure.ResourceGroup.Name/ # pylint: disable=line-too-long _MAX_CLUSTER_NAME_LEN_LIMIT = 42 - _BEST_DISK_TIER = resources_utils.DiskTier.MEDIUM + _BEST_DISK_TIER = resources_utils.DiskTier.HIGH _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM # Azure does not support high disk and ultra disk tier. - _SUPPORTED_DISK_TIERS = ( - set(resources_utils.DiskTier) - - {resources_utils.DiskTier.HIGH, resources_utils.DiskTier.ULTRA}) + _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - + {resources_utils.DiskTier.ULTRA}) _INDENT_PREFIX = ' ' * 4 @@ -361,7 +360,9 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: start_index += 1 assert False, 'Low disk tier should always be supported on Azure.' - return { + disk_tier = _failover_disk_tier() + + resources_vars = { 'instance_type': r.instance_type, 'custom_resources': custom_resources, 'num_gpus': acc_count, @@ -371,12 +372,18 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: 'zones': None, **image_config, 'need_nvidia_driver_extension': need_nvidia_driver_extension, - 'disk_tier': Azure._get_disk_type(_failover_disk_tier()), + 'disk_tier': Azure._get_disk_type(disk_tier), 'cloud_init_setup_commands': cloud_init_setup_commands, 'azure_subscription_id': self.get_project_id(dryrun), 'resource_group': f'{cluster_name.name_on_cloud}-{region_name}', } + # Setting disk performance tier for high disk tier. + if disk_tier == resources_utils.DiskTier.HIGH: + resources_vars['disk_performance_tier'] = 'P50' + + return resources_vars + def _get_feasible_launchable_resources( self, resources: 'resources.Resources' ) -> 'resources_utils.FeasibleResources': @@ -600,10 +607,10 @@ def check_disk_tier( disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: return True, '' - if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + if disk_tier == resources_utils.DiskTier.ULTRA: return False, ( - 'Azure disk_tier={high, ultra} is not supported now. ' - 'Please use disk_tier={low, medium, best} instead.') + 'Azure disk_tier=ultra is not supported now. ' + 'Please use disk_tier={low, medium, high, best} instead.') # Only S-series supported premium ssd # see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long if cls._get_disk_type( @@ -611,7 +618,7 @@ def check_disk_tier( ) == 'Premium_LRS' and not Azure._is_s_series(instance_type): return False, ( 'Azure premium SSDs are only supported for S-series ' - 'instances. To use disk_tier=medium, please make sure ' + 'instances. To use disk_tier>=medium, please make sure ' 'instance_type is specified to an S-series instance.') return True, '' @@ -631,7 +638,7 @@ def _get_disk_type(cls, # cannot be used as OS disks so we might need data disk support tier2name = { resources_utils.DiskTier.ULTRA: 'Disabled', - resources_utils.DiskTier.HIGH: 'Disabled', + resources_utils.DiskTier.HIGH: 'Premium_LRS', resources_utils.DiskTier.MEDIUM: 'Premium_LRS', resources_utils.DiskTier.LOW: 'Standard_LRS', } diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index 2a8d54273c2..009fb889848 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -18,6 +18,7 @@ from sky.provision import common from sky.provision import constants from sky.utils import common_utils +from sky.utils import subprocess_utils from sky.utils import ux_utils if typing.TYPE_CHECKING: @@ -274,6 +275,17 @@ def _create_instances( deployment_name=vm_name, parameters=parameters, ).wait() + + performance_tier = node_config.get('disk_performance_tier', None) + if performance_tier is not None: + disks = compute_client.disks.list_by_resource_group(resource_group) + for disk in disks: + name = disk.name + # TODO(tian): Investigate if we can use Python SDK to update this. + subprocess_utils.run_no_outputs( + f'az disk update -n {name} -g {resource_group} ' + f'--set tier={performance_tier}') + filters = { constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud, _TAG_SKYPILOT_VM_ID: vm_id diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 65d500fc677..77ddda6652f 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -81,6 +81,9 @@ available_node_types: {{ cmd }} {%- endfor %} need_nvidia_driver_extension: {{need_nvidia_driver_extension}} + {%- if disk_performance_tier is not none %} + disk_performance_tier: {{disk_performance_tier}} + {%- endif %} # TODO: attach disk head_node_type: ray.head.default diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index dfda65e23da..f1af9a0d9ee 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -765,12 +765,12 @@ def _get_all_candidate_cloud(r: sky.Resources) -> Set[clouds.Cloud]: map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp', 'azure', 'oci'])), low_tier_candidates - # Only AWS, GCP, OCI supports HIGH disk tier. + # Only AWS, GCP, Azure, OCI supports HIGH disk tier. high_tier_resources = sky.Resources(disk_tier=resources_utils.DiskTier.HIGH) high_tier_candidates = _get_all_candidate_cloud(high_tier_resources) assert high_tier_candidates == set( map(clouds.CLOUD_REGISTRY.get, - ['aws', 'gcp', 'oci'])), high_tier_candidates + ['aws', 'gcp', 'azure', 'oci'])), high_tier_candidates # Only AWS, GCP supports ULTRA disk tier. ultra_tier_resources = sky.Resources(