Skip to content

Commit

Permalink
Support start and stop Azure instances (#316)
Browse files Browse the repository at this point in the history
* azure external node provider

* allow stopping Azure

* update token check
  • Loading branch information
suquark committed Feb 15, 2022
1 parent a11d502 commit 4da5d1f
Show file tree
Hide file tree
Showing 18 changed files with 927 additions and 25 deletions.
1 change: 1 addition & 0 deletions prototype/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
include sky/clouds/service_catalog/data/*
include sky/skylet/*.sh
include sky/skylet/providers/azure/*
include sky/skylet/providers/gcp/*
22 changes: 20 additions & 2 deletions prototype/config/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ max_workers: {{num_nodes - 1}}
idle_timeout_minutes: 60

provider:
type: azure
type: external
module: sky.skylet.providers.azure.AzureNodeProvider
location: {{region}}
# Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
# For Azure, ray distinguishes different instances by the resource_group,
Expand All @@ -15,6 +16,8 @@ provider:
# Keep (otherwise cannot reuse when re-provisioning).
# teardown(terminate=True) will override this.
cache_stopped_nodes: True
# Azure subscription ID (the default subscription of the local Azure CLI profile).
subscription_id: {{azure_subscription_id}}

auth:
ssh_user: azureuser
Expand Down Expand Up @@ -80,9 +83,12 @@ initialization_commands:
{%- for cmd in initialization_commands %}
- {{cmd}}
{%- endfor %}
{%- else %}
initialization_commands: []
{%- endif %}

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude: []
# rsync_exclude:
# - "**/.git"
# - "**/.git/**"
Expand All @@ -95,7 +101,8 @@ setup_commands:
# This AMI's system Python is version 2+.
# _gang_schedule_ray_up() requires first two lines; do not modify.
- pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
# We have to install azure-cli because the Azure cluster does not pre-install it.
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && pip3 install azure-cli==2.30.0 && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
{%- if setup_sh_path is not none %}
- cd ~/sky_workdir && bash /tmp/setup.sh # FIXME: /tmp is volatile.
{%- endif %}
Expand All @@ -107,4 +114,15 @@ head_start_ray_commands:
{%- if num_nodes > 1 %}
worker_start_ray_commands:
- ray stop; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}}
{%- else %}
worker_start_ray_commands: []
{%- endif %}

head_node: {}
worker_nodes: {}

# These fields are required for external cloud providers.
head_setup_commands: []
worker_setup_commands: []
cluster_synced_files: []
file_mounts_sync_continuously: False
2 changes: 1 addition & 1 deletion prototype/config/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ idle_timeout_minutes: 60
provider:
# We use a custom node provider for GCP to support instance stop and reuse.
type: external # type: gcp
module: sky.skylet.providers.GCPNodeProvider
module: sky.skylet.providers.gcp.GCPNodeProvider
region: {{region}}
availability_zone: {{zones}}
project_id: intercloud-320520 # Globally unique project id
Expand Down
13 changes: 13 additions & 0 deletions prototype/examples/azure_start_stop.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# start and stop Azure instances
name: azure-start-stop

resources:
cloud: azure

num_nodes: 2

# The setup command. Will be run under the working directory.
setup: 'echo "azure-start-stop [setup]"'

# The command to run. Will be run under the working directory.
run: 'echo "azure-start-stop [run]"'
8 changes: 8 additions & 0 deletions prototype/examples/run_smoke_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,11 @@ sky stop gcp-start-stop
sky start gcp-start-stop
sky exec gcp-start-stop "$DIR"/gcp_start_stop.yaml
sky down gcp-start-stop

## ---------- Testing Azure start and stop instances ----------
sky launch -c azure-start-stop "$DIR"/azure_start_stop.yaml
sky exec azure-start-stop "$DIR"/azure_start_stop.yaml
sky stop azure-start-stop
sky start azure-start-stop
sky exec azure-start-stop "$DIR"/azure_start_stop.yaml
sky down azure-start-stop
2 changes: 1 addition & 1 deletion prototype/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
'aws': ['awscli==1.22.17', 'boto3'],
# ray <= 1.9.1 requires an older version of azure-cli. We can get rid of
# this version requirement once ray 1.10 is adopted as our local version.
'azure': ['azure-cli==2.22.0'],
'azure': ['azure-cli==2.30.0'],
'gcp': ['google-api-python-client', 'google-cloud-storage'],
}

Expand Down
19 changes: 19 additions & 0 deletions prototype/sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from sky import sky_logging
from sky import resources
from sky import task as task_lib
from sky.cloud_adaptors import azure
from sky.skylet import log_lib

logger = sky_logging.init_logger(__name__)
Expand Down Expand Up @@ -402,6 +403,22 @@ def write_cluster_config(task: task_lib.Task,
if isinstance(cloud, clouds.AWS):
aws_default_ami = cloud.get_default_ami(region)

azure_subscription_id = None
if isinstance(cloud, clouds.Azure):
if dryrun:
azure_subscription_id = 'ffffffff-ffff-ffff-ffff-ffffffffffff'
else:
try:
azure_subscription_id = azure.get_subscription_id()
if not azure_subscription_id:
raise ValueError # The error message will be replaced.
except Exception:
raise RuntimeError(
'Fail to get subscription id from azure cli. '
'Make sure you have login in and fix it with this Azure '
'cli command: "az account set -s <subscription_id>".'
) from None

assert cluster_name is not None

setup_sh_path = None
Expand Down Expand Up @@ -467,6 +484,8 @@ def write_cluster_config(task: task_lib.Task,
'zones': ','.join(zones),
# AWS only.
'aws_default_ami': aws_default_ami,
# Azure only.
'azure_subscription_id': azure_subscription_id,
# Ray version.
'ray_version': SKY_REMOTE_RAY_VERSION,
# Sky remote utils.
Expand Down
27 changes: 10 additions & 17 deletions prototype/sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1562,22 +1562,8 @@ def teardown(self, handle: ResourceHandle, terminate: bool) -> None:
prev_status = global_user_state.get_status_from_cluster_name(
handle.cluster_name)
cluster_name = config['cluster_name']
if not terminate and not isinstance(cloud, (clouds.AWS, clouds.GCP)):
# FIXME: no mentions of cache_stopped_nodes in
# https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/_azure/node_provider.py
raise ValueError(
f'Stopping cluster {handle.cluster_name!r}: not supported on '
'non-AWS and non-GCP clusters yet. Try manually stopping, '
f'or terminate by: sky down {handle.cluster_name}')
if isinstance(cloud, clouds.Azure):
# Special handling because `ray down` is buggy with Azure.
# Set check=False to not error out on not found VMs.
backend_utils.run(
'az vm delete --yes --ids $(az vm list --query '
f'"[? contains(name, \'{cluster_name}\')].id" -o tsv)',
check=False)
elif (terminate and
prev_status == global_user_state.ClusterStatus.STOPPED):
if (terminate and
prev_status == global_user_state.ClusterStatus.STOPPED):
if isinstance(cloud, clouds.AWS):
# TODO (zhwu): Room for optimization. We can move these cloud
# specific handling to the cloud class.
Expand All @@ -1594,11 +1580,18 @@ def teardown(self, handle: ResourceHandle, terminate: bool) -> None:
f'aws ec2 terminate-instances --region {region} '
f'--instance-ids $({query_cmd})')
backend_utils.run(terminate_cmd, check=True)
elif isinstance(cloud, clouds.Azure):
# Special handling because `ray down` is buggy with Azure.
# Set check=False to not error out on not found VMs.
backend_utils.run(
'az vm delete --yes --ids $(az vm list --query '
f'"[? contains(name, \'{cluster_name}\')].id" -o tsv)',
check=False)
else:
# TODO(suquark,zongheng): Support deleting stopped GCP clusters.
# Tracked in issue #318.
logger.info(
f'Cannot terminate non-AWS cluster {cluster_name!r} '
f'Cannot terminate GCP cluster {cluster_name!r} '
'because it is STOPPED. \nTo fix: manually terminate in '
'the cloud\'s UI or '
f'`sky start {cluster_name}; sky down {cluster_name}` '
Expand Down
9 changes: 9 additions & 0 deletions prototype/sky/cloud_adaptors/azure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Azure cli adaptor"""

# pylint: disable=import-outside-toplevel


def get_subscription_id() -> str:
    """Return the default subscription id of the local Azure CLI profile."""
    # Deferred import: the Azure SDK is only loaded when this function is
    # actually called, not when this adaptor module is imported.
    from azure.common import credentials

    cli_profile = credentials.get_cli_profile()
    return cli_profile.get_subscription_id()
5 changes: 3 additions & 2 deletions prototype/sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,11 @@ def check_credentials(self) -> Tuple[bool, Optional[str]]:
# This file is required because it will be synced to remote VMs for
# `az` to access private storage buckets.
# `az account show` does not guarantee this file exists.
if not os.path.isfile(os.path.expanduser('~/.azure/accessTokens.json')):
azure_token_cache_file = '~/.azure/msal_token_cache.json'
if not os.path.isfile(os.path.expanduser(azure_token_cache_file)):
return (
False,
'~/.azure/accessTokens.json does not exist. Run `az login`.' +
f'{azure_token_cache_file} does not exist. Run `az login`.' +
help_str)
try:
output = _run_output('az account show --output=json')
Expand Down
4 changes: 4 additions & 0 deletions prototype/sky/skylet/LICENCE
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ https://github.com/ray-project/ray/blob/releases/1.10.0/python/ray/_private/log_
Code in ray_patches/worker.py from
https://github.com/ray-project/ray/blob/ray-1.9.2/python/ray/worker.py

Code in providers/azure from
https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/_private/_azure
Git Revision: 7f1bacc7dc9caf6d0ec042e39499bbf1d9a7d065

Code in providers/gcp from
https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/_private/gcp
Git Revision: ef9d9df4e7454d428a958281e9de333795dccb44
Expand Down
2 changes: 0 additions & 2 deletions prototype/sky/skylet/providers/__init__.py

This file was deleted.

2 changes: 2 additions & 0 deletions prototype/sky/skylet/providers/azure/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Azure node provider"""
from sky.skylet.providers.azure.node_provider import AzureNodeProvider
88 changes: 88 additions & 0 deletions prototype/sky/skylet/providers/azure/azure-config-template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"subnet": {
"type": "string",
"metadata": {
"description": "The subnet to be used"
}
}
},
"variables": {
"Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]"
},
"resources": [
{
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
"apiVersion": "2018-11-30",
"location": "[variables('location')]",
"name": "ray-msi-user-identity"
},
{
"type": "Microsoft.Authorization/roleAssignments",
"apiVersion": "2020-04-01-preview",
"name": "[guid(resourceGroup().id)]",
"properties": {
"principalId": "[reference('ray-msi-user-identity').principalId]",
"roleDefinitionId": "[variables('Contributor')]",
"scope": "[resourceGroup().id]",
"principalType": "ServicePrincipal"
},
"dependsOn": [
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]"
]
},
{
"type": "Microsoft.Network/networkSecurityGroups",
"apiVersion": "2019-02-01",
"name": "ray-nsg",
"location": "[variables('location')]",
"properties": {
"securityRules": [
{
"name": "SSH",
"properties": {
"priority": 1000,
"protocol": "TCP",
"access": "Allow",
"direction": "Inbound",
"sourceAddressPrefix": "*",
"sourcePortRange": "*",
"destinationAddressPrefix": "*",
"destinationPortRange": "22"
}
}
]
}
},
{
"type": "Microsoft.Network/virtualNetworks",
"apiVersion": "2019-11-01",
"name": "ray-vnet",
"location": "[variables('location')]",
"properties": {
"addressSpace": {
"addressPrefixes": [
"[parameters('subnet')]"
]
},
"subnets": [
{
"name": "ray-subnet",
"properties": {
"addressPrefix": "[parameters('subnet')]",
"networkSecurityGroup": {
"id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
}
}
}
]
},
"dependsOn": [
"[resourceId('Microsoft.Network/networkSecurityGroups', 'ray-nsg')]"
]
}
]
}
Loading

0 comments on commit 4da5d1f

Please sign in to comment.