Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support start and stop Azure instances #316

Merged
merged 5 commits into from
Feb 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions prototype/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
include sky/clouds/service_catalog/data/*
include sky/skylet/*.sh
include sky/skylet/providers/azure/*
include sky/skylet/providers/gcp/*
22 changes: 20 additions & 2 deletions prototype/config/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ max_workers: {{num_nodes - 1}}
idle_timeout_minutes: 60

provider:
type: azure
type: external
module: sky.skylet.providers.azure.AzureNodeProvider
location: {{region}}
# Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
# For Azure, ray distinguishes different instances by the resource_group,
Expand All @@ -15,6 +16,8 @@ provider:
# Keep (otherwise cannot reuse when re-provisioning).
# teardown(terminate=True) will override this.
cache_stopped_nodes: True
# Subscription ID of the Azure account to use (rendered from `azure_subscription_id`).
subscription_id: {{azure_subscription_id}}

auth:
ssh_user: azureuser
Expand Down Expand Up @@ -80,9 +83,12 @@ initialization_commands:
{%- for cmd in initialization_commands %}
- {{cmd}}
{%- endfor %}
{%- else %}
initialization_commands: []
{%- endif %}

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude: []
# rsync_exclude:
# - "**/.git"
# - "**/.git/**"
Expand All @@ -95,7 +101,8 @@ setup_commands:
# This AMI's system Python is version 2+.
# _gang_schedule_ray_up() requires first two lines; do not modify.
- pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
# We have to install azure-cli because the Azure cluster does not pre-install it.
- pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && pip3 install azure-cli==2.30.0 && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file
{%- if setup_sh_path is not none %}
- cd ~/sky_workdir && bash /tmp/setup.sh # FIXME: /tmp is volatile.
{%- endif %}
Expand All @@ -107,4 +114,15 @@ head_start_ray_commands:
{%- if num_nodes > 1 %}
worker_start_ray_commands:
- ray stop; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}}
{%- else %}
worker_start_ray_commands: []
{%- endif %}

head_node: {}
worker_nodes: {}

# These fields are required for external cloud providers.
head_setup_commands: []
worker_setup_commands: []
cluster_synced_files: []
file_mounts_sync_continuously: False
2 changes: 1 addition & 1 deletion prototype/config/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ idle_timeout_minutes: 60
provider:
# We use a custom node provider for GCP to support instance stop and reuse.
type: external # type: gcp
module: sky.skylet.providers.GCPNodeProvider
module: sky.skylet.providers.gcp.GCPNodeProvider
region: {{region}}
availability_zone: {{zones}}
project_id: intercloud-320520 # Globally unique project id
Expand Down
13 changes: 13 additions & 0 deletions prototype/examples/azure_start_stop.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Task used to exercise starting and stopping Azure instances.
name: azure-start-stop

# `cloud` must be nested under `resources`; at the top level it would leave
# `resources` null and `cloud` as an unrelated key.
resources:
  cloud: azure

num_nodes: 2

# The setup command. Will be run under the working directory.
setup: 'echo "azure-start-stop [setup]"'

# The command to run. Will be run under the working directory.
run: 'echo "azure-start-stop [run]"'
8 changes: 8 additions & 0 deletions prototype/examples/run_smoke_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,11 @@ sky stop gcp-start-stop
sky start gcp-start-stop
sky exec gcp-start-stop "$DIR"/gcp_start_stop.yaml
sky down gcp-start-stop

## ---------- Testing Azure start and stop instances ----------
sky launch -c azure-start-stop "$DIR"/azure_start_stop.yaml
sky exec azure-start-stop "$DIR"/azure_start_stop.yaml
sky stop azure-start-stop
sky start azure-start-stop
sky exec azure-start-stop "$DIR"/azure_start_stop.yaml
sky down azure-start-stop
2 changes: 1 addition & 1 deletion prototype/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
'aws': ['awscli==1.22.17', 'boto3'],
# ray <= 1.9.1 requires an older version of azure-cli. We can get rid of
# this version requirement once ray 1.10 is adopted as our local version.
'azure': ['azure-cli==2.22.0'],
'azure': ['azure-cli==2.30.0'],
'gcp': ['google-api-python-client', 'google-cloud-storage'],
}

Expand Down
19 changes: 19 additions & 0 deletions prototype/sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from sky import sky_logging
from sky import resources
from sky import task as task_lib
from sky.cloud_adaptors import azure
suquark marked this conversation as resolved.
Show resolved Hide resolved
from sky.skylet import log_lib

logger = sky_logging.init_logger(__name__)
Expand Down Expand Up @@ -402,6 +403,22 @@ def write_cluster_config(task: task_lib.Task,
if isinstance(cloud, clouds.AWS):
aws_default_ami = cloud.get_default_ami(region)

azure_subscription_id = None
if isinstance(cloud, clouds.Azure):
if dryrun:
azure_subscription_id = 'ffffffff-ffff-ffff-ffff-ffffffffffff'
else:
try:
azure_subscription_id = azure.get_subscription_id()
if not azure_subscription_id:
raise ValueError # The error message will be replaced.
except Exception:
raise RuntimeError(
'Fail to get subscription id from azure cli. '
'Make sure you have login in and fix it with this Azure '
'cli command: "az account set -s <subscription_id>".'
) from None

assert cluster_name is not None

setup_sh_path = None
Expand Down Expand Up @@ -467,6 +484,8 @@ def write_cluster_config(task: task_lib.Task,
'zones': ','.join(zones),
# AWS only.
'aws_default_ami': aws_default_ami,
# Azure only.
'azure_subscription_id': azure_subscription_id,
# Ray version.
'ray_version': SKY_REMOTE_RAY_VERSION,
# Sky remote utils.
Expand Down
27 changes: 10 additions & 17 deletions prototype/sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1562,22 +1562,8 @@ def teardown(self, handle: ResourceHandle, terminate: bool) -> None:
prev_status = global_user_state.get_status_from_cluster_name(
handle.cluster_name)
cluster_name = config['cluster_name']
if not terminate and not isinstance(cloud, (clouds.AWS, clouds.GCP)):
# FIXME: no mentions of cache_stopped_nodes in
# https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/_azure/node_provider.py
raise ValueError(
f'Stopping cluster {handle.cluster_name!r}: not supported on '
'non-AWS and non-GCP clusters yet. Try manually stopping, '
f'or terminate by: sky down {handle.cluster_name}')
if isinstance(cloud, clouds.Azure):
# Special handling because `ray down` is buggy with Azure.
# Set check=False to not error out on not found VMs.
backend_utils.run(
'az vm delete --yes --ids $(az vm list --query '
f'"[? contains(name, \'{cluster_name}\')].id" -o tsv)',
check=False)
elif (terminate and
prev_status == global_user_state.ClusterStatus.STOPPED):
if (terminate and
prev_status == global_user_state.ClusterStatus.STOPPED):
if isinstance(cloud, clouds.AWS):
# TODO (zhwu): Room for optimization. We can move these cloud
# specific handling to the cloud class.
Expand All @@ -1594,11 +1580,18 @@ def teardown(self, handle: ResourceHandle, terminate: bool) -> None:
f'aws ec2 terminate-instances --region {region} '
f'--instance-ids $({query_cmd})')
backend_utils.run(terminate_cmd, check=True)
elif isinstance(cloud, clouds.Azure):
# Special handling because `ray down` is buggy with Azure.
# Set check=False to not error out on not found VMs.
backend_utils.run(
'az vm delete --yes --ids $(az vm list --query '
f'"[? contains(name, \'{cluster_name}\')].id" -o tsv)',
check=False)
else:
# TODO(suquark,zongheng): Support deleting stopped GCP clusters.
# Tracked in issue #318.
logger.info(
f'Cannot terminate non-AWS cluster {cluster_name!r} '
f'Cannot terminate GCP cluster {cluster_name!r} '
'because it is STOPPED. \nTo fix: manually terminate in '
'the cloud\'s UI or '
f'`sky start {cluster_name}; sky down {cluster_name}` '
Expand Down
9 changes: 9 additions & 0 deletions prototype/sky/cloud_adaptors/azure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Azure cli adaptor"""

# pylint: disable=import-outside-toplevel


def get_subscription_id() -> str:
    """Get the default subscription id.

    Looks up the Azure CLI profile through
    ``azure.common.credentials.get_cli_profile()`` and returns the value of
    its ``get_subscription_id()`` call unchanged.

    Returns:
        The subscription id reported by the Azure CLI profile.

    Raises:
        Nothing is caught here: an import failure or any error raised by the
        Azure SDK propagates to the caller.
    """
    # Function-scope import (hence the module-level pylint disable).
    # NOTE(review): presumably this keeps the Azure SDK an optional
    # dependency for users of other clouds -- confirm.
    from azure.common import credentials
    return credentials.get_cli_profile().get_subscription_id()
5 changes: 3 additions & 2 deletions prototype/sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,11 @@ def check_credentials(self) -> Tuple[bool, Optional[str]]:
# This file is required because it will be synced to remote VMs for
# `az` to access private storage buckets.
# `az account show` does not guarantee this file exists.
if not os.path.isfile(os.path.expanduser('~/.azure/accessTokens.json')):
azure_token_cache_file = '~/.azure/msal_token_cache.json'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the difference between:

  • ~/.azure/accessTokens.json
  • ~/.azure/msal_token_cache.json (sounds like a cache)?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

~/.azure/accessTokens.json is the token cache used by older azure-cli versions; ~/.azure/msal_token_cache.json is the token cache used by newer versions.

if not os.path.isfile(os.path.expanduser(azure_token_cache_file)):
return (
False,
'~/.azure/accessTokens.json does not exist. Run `az login`.' +
f'{azure_token_cache_file} does not exist. Run `az login`.' +
help_str)
try:
output = _run_output('az account show --output=json')
Expand Down
4 changes: 4 additions & 0 deletions prototype/sky/skylet/LICENCE
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ https://github.com/ray-project/ray/blob/releases/1.10.0/python/ray/_private/log_
Code in ray_patches/worker.py from
https://github.com/ray-project/ray/blob/ray-1.9.2/python/ray/worker.py

Code in providers/azure from
https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/_private/_azure
Git Revision: 7f1bacc7dc9caf6d0ec042e39499bbf1d9a7d065

Code in providers/gcp from
https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/_private/gcp
Git Revision: ef9d9df4e7454d428a958281e9de333795dccb44
Expand Down
2 changes: 0 additions & 2 deletions prototype/sky/skylet/providers/__init__.py

This file was deleted.

2 changes: 2 additions & 0 deletions prototype/sky/skylet/providers/azure/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Azure node provider"""
from sky.skylet.providers.azure.node_provider import AzureNodeProvider
88 changes: 88 additions & 0 deletions prototype/sky/skylet/providers/azure/azure-config-template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"subnet": {
"type": "string",
"metadata": {
"description": "The subnet to be used"
}
}
},
"variables": {
"Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]"
},
"resources": [
{
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
"apiVersion": "2018-11-30",
"location": "[variables('location')]",
"name": "ray-msi-user-identity"
},
{
"type": "Microsoft.Authorization/roleAssignments",
"apiVersion": "2020-04-01-preview",
"name": "[guid(resourceGroup().id)]",
"properties": {
"principalId": "[reference('ray-msi-user-identity').principalId]",
"roleDefinitionId": "[variables('Contributor')]",
"scope": "[resourceGroup().id]",
"principalType": "ServicePrincipal"
},
"dependsOn": [
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]"
]
},
{
"type": "Microsoft.Network/networkSecurityGroups",
"apiVersion": "2019-02-01",
"name": "ray-nsg",
"location": "[variables('location')]",
"properties": {
"securityRules": [
{
"name": "SSH",
"properties": {
"priority": 1000,
"protocol": "TCP",
"access": "Allow",
"direction": "Inbound",
"sourceAddressPrefix": "*",
"sourcePortRange": "*",
"destinationAddressPrefix": "*",
"destinationPortRange": "22"
}
}
]
}
},
{
"type": "Microsoft.Network/virtualNetworks",
"apiVersion": "2019-11-01",
"name": "ray-vnet",
"location": "[variables('location')]",
"properties": {
"addressSpace": {
"addressPrefixes": [
"[parameters('subnet')]"
]
},
"subnets": [
{
"name": "ray-subnet",
"properties": {
"addressPrefix": "[parameters('subnet')]",
"networkSecurityGroup": {
"id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
}
}
}
]
},
"dependsOn": [
"[resourceId('Microsoft.Network/networkSecurityGroups', 'ray-nsg')]"
]
}
]
}
Loading