Skip to content

Commit

Permalink
Sky init and cloud credentials for storage (#102)
Browse files Browse the repository at this point in the history
* sky init

* fixes and lint

* Create test_enabled_clouds.py

* Optimizer is now aware of enabled_clouds

* Fix pytest

* Update registry.py

* Support GCS buckets

* Make GCS on GCP work

* yapf behaves differently across versions...

* yapf pls

* Fix Azure

* tweak messages

* tweak

* Apply hotfix from #127

* Simple fixes

* Use monkeypatch

* Address comments

* get rid of Task.enabled_clouds

* fix test

* Always install aws and gcloud utils

* Address comments

* oops

* Revert "Always install aws and gcloud utils"

This reverts commit a4630b1.

* Refactor and trigger `sky init` automatically

* Reverted to `is_directory`

* nits

* better check

* usability

* fix tests

* nits

* Address latest comments

* Update init.py

* Fix CLI messages

* Sync credentials regardless of storage mounts

* Fix cpunode and more docs

* Apply changes to requirements

* Update setup.py

* Fix tests

* Fixes

* Add links, fix test

Co-authored-by: Michael Luo <michael.luo@berkeley.edu>
Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
  • Loading branch information
3 people committed Feb 11, 2022
1 parent f1d93c5 commit a21505e
Show file tree
Hide file tree
Showing 19 changed files with 439 additions and 78 deletions.
2 changes: 1 addition & 1 deletion prototype/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ ray attach config/gcp.yml
ray down config/gcp.yml
```

**Azure**. Install the Azure CLI (`pip install azure-cli`) then login using `az login`. Set the subscription to use from the command line (`az account set -s <subscription_id>`) or by modifying the provider section of the Azure template (`config/azure.yml.j2`). Ray Autoscaler does not work with the latest version of `azure-cli`. Hotfix: `pip install azure-cli-core==2.22.0` (this will make Ray work but at the cost of making the `az` CLI tool unusable).
**Azure**. Install the Azure CLI (`pip install azure-cli==2.22.0`) then login using `az login`. Set the subscription to use from the command line (`az account set -s <subscription_id>`). Ray Autoscaler does not work with the latest version of `azure-cli` as of 1.9.1, hence the fixed Azure version.

## Open issues

Expand Down
9 changes: 6 additions & 3 deletions prototype/examples/resnet_distributed_tf_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
with sky.Dag() as dag:
# The working directory contains all code and will be synced to remote.
workdir = '~/Downloads/tpu'
subprocess.run(f'cd {workdir} && git checkout 222cc86',
shell=True,
check=True)
subprocess.run(
'cd ~/Downloads; '
'(git clone https://github.com/concretevitamin/tpu || true); '
f'cd {workdir} && git checkout 9459fee',
shell=True,
check=True)

docker_image = None # 'rayproject/ray-ml:latest-gpu'

Expand Down
4 changes: 3 additions & 1 deletion prototype/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@

extras_require = {
'aws': ['awscli==1.22.17', 'boto3'],
'azure': ['azure-cli'],
# ray <= 1.9.1 requires an older version of azure-cli. We can get rid of
# this version requirement once ray 1.10 is adopted as our local version.
'azure': ['azure-cli==2.22.0'],
'gcp': ['google-api-python-client', 'google-cloud-storage'],
}

Expand Down
2 changes: 0 additions & 2 deletions prototype/sky/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from sky.execution import launch, exec # pylint: disable=redefined-builtin
from sky.resources import Resources
from sky.task import Task
from sky.registry import fill_in_launchable_resources
from sky.optimizer import Optimizer, OptimizeTarget
from sky.data import Storage, StorageType

Expand All @@ -34,7 +33,6 @@
'backends',
'launch',
'exec',
'fill_in_launchable_resources',
'list_accelerators',
'__root_dir__',
'Storage',
Expand Down
12 changes: 8 additions & 4 deletions prototype/sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,10 @@ def add_cluster(
host_name = ip
logger.warning(f'Using {ip} to identify host instead.')
break
else:
config = ['\n']
with open(config_path, 'w') as f:
f.writelines(config)

codegen = cls._get_generated_config(sky_autogen_comment, host_name, ip,
username, key_path)
Expand All @@ -252,7 +256,7 @@ def add_cluster(
f.write('\n')
else:
with open(config_path, 'a') as f:
if not config[-1].endswith('\n'):
if len(config) > 0 and not config[-1].endswith('\n'):
# Add trailing newline if it doesn't exist.
f.write('\n')
f.write('\n')
Expand Down Expand Up @@ -791,9 +795,9 @@ def generate_cluster_name():
return f'sky-{uuid.uuid4().hex[:4]}-{getpass.getuser()}'


def get_backend_from_handle(handle: backends.Backend.ResourceHandle):
"""
Get a backend object from a handle.
def get_backend_from_handle(
handle: backends.Backend.ResourceHandle) -> backends.Backend:
"""Gets a Backend object corresponding to a handle.
Inspects handle type to infer the backend used for the resource.
"""
Expand Down
17 changes: 15 additions & 2 deletions prototype/sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import sky
from sky import backends
from sky import global_user_state
from sky import init as sky_init
from sky import sky_logging
from sky import clouds
from sky.backends import backend as backend_lib
Expand Down Expand Up @@ -79,7 +80,7 @@ def _truncate_long_string(s: str, max_length: int = 50) -> str:
return s
splits = s.split(' ')
if len(splits[0]) > max_length:
return splits[0][:max_length] + '...'
return splits[0][:max_length] + '...' # Use '…'?
# Truncate on word boundary.
i = 0
total = 0
Expand Down Expand Up @@ -279,6 +280,7 @@ def _create_and_ssh_into_node(
run='',
)
task.set_resources(resources)
task.update_file_mounts(sky_init.get_cloud_credential_file_mounts())

backend = backend if backend is not None else backends.CloudVmRayBackend()
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
Expand Down Expand Up @@ -1056,7 +1058,8 @@ def _terminate_or_stop_clusters(names: Tuple[str], apply_to_all: Optional[bool],
name = record['name']
handle = record['handle']
backend = backend_utils.get_backend_from_handle(handle)
if handle.launched_resources.use_spot and not terminate:
if (isinstance(backend, backends.CloudVmRayBackend) and
handle.launched_resources.use_spot and not terminate):
# TODO(suquark): enable GCP+spot to be stopped in the future.
click.secho(
f'Stopping cluster {name}... skipped, because spot instances '
Expand Down Expand Up @@ -1293,6 +1296,16 @@ def tpunode(cluster: str, port_forward: Optional[List[int]],


@cli.command()
def init():
"""Determines a set of clouds that Sky will use.
It checks access credentials for AWS, Azure and GCP. Sky tasks will only
run in clouds that you have access to. After configuring access for a
cloud, rerun `sky init` to reflect the changes.
"""
sky_init.init()


@click.argument('gpu_name', required=False)
@click.option('--all',
'-a',
Expand Down
2 changes: 0 additions & 2 deletions prototype/sky/cloud_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
TODO:
* Better interface.
* Better implementation (e.g., fsspec, smart_open, using each cloud's SDK).
The full-blown impl should handle authentication so each user's private
datasets can be accessed.
"""
import subprocess
import urllib.parse
Expand Down
6 changes: 6 additions & 0 deletions prototype/sky/clouds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,20 @@
__all__ = [
'AWS',
'Azure',
'CLOUD_REGISTRY',
'Cloud',
'GCP',
'Region',
'Zone',
'from_str',
]

CLOUD_REGISTRY = {
'aws': AWS(),
'gcp': GCP(),
'azure': Azure(),
}


def from_str(name: str) -> 'Cloud':
return CLOUD_REGISTRY[name.lower()]
54 changes: 54 additions & 0 deletions prototype/sky/clouds/aws.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Amazon Web Services."""
import copy
import json
import os
import subprocess
from typing import Dict, Iterator, List, Optional, Tuple, TYPE_CHECKING

from sky import clouds
Expand All @@ -11,6 +13,15 @@
from sky import resources as resources_lib


def _run_output(cmd):
proc = subprocess.run(cmd,
shell=True,
check=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
return proc.stdout.decode('ascii')


class AWS(clouds.Cloud):
"""Amazon Web Services."""

Expand Down Expand Up @@ -187,3 +198,46 @@ def _make(instance_type):
if instance_type is None:
return []
return _make(instance_type)

def check_credentials(self) -> Tuple[bool, Optional[str]]:
"""Checks if the user has access credentials to this cloud."""
help_str = (
'\n For more info: '
'https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html' # pylint: disable=line-too-long
)
# This file is required because it will be synced to remote VMs for
# `aws` to access private storage buckets.
# `aws configure list` does not guarantee this file exists.
if not os.path.isfile(os.path.expanduser('~/.aws/credentials')):
return (False,
'~/.aws/credentials does not exist. Run `aws configure`.' +
help_str)
try:
output = _run_output('aws configure list')
except subprocess.CalledProcessError:
return False, 'AWS CLI not installed properly.'
# Configured correctly, the AWS output should look like this:
# ...
# access_key ******************** shared-credentials-file
# secret_key ******************** shared-credentials-file
# ...
# Otherwise, one or both keys will show as '<not set>'.
lines = output.split('\n')
if len(lines) < 2:
return False, 'AWS CLI output invalid.'
access_key_ok = False
secret_key_ok = False
for line in lines[2:]:
line = line.lstrip()
if line.startswith('access_key'):
if '<not set>' not in line:
access_key_ok = True
elif line.startswith('secret_key'):
if '<not set>' not in line:
secret_key_ok = True
if access_key_ok and secret_key_ok:
return True, None
return False, 'AWS credentials not set. Run `aws configure`.' + help_str

def get_credential_file_mounts(self) -> Dict[str, str]:
return {'~/.aws': '~/.aws'}
40 changes: 40 additions & 0 deletions prototype/sky/clouds/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,23 @@
"""Azure."""
import copy
import json
import os
import subprocess
from typing import Dict, Iterator, List, Optional, Tuple

from sky import clouds
from sky.clouds.service_catalog import azure_catalog


def _run_output(cmd):
proc = subprocess.run(cmd,
shell=True,
check=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
return proc.stdout.decode('ascii')


class Azure(clouds.Cloud):
"""Azure."""

Expand Down Expand Up @@ -147,3 +158,32 @@ def _make(instance_type):
if instance_type is None:
return []
return _make(instance_type)

def check_credentials(self) -> Tuple[bool, Optional[str]]:
"""Checks if the user has access credentials to this cloud."""
help_str = (
'\n For more info: '
'https://docs.microsoft.com/en-us/cli/azure/get-started-with-azure-cli' # pylint: disable=line-too-long
)
# This file is required because it will be synced to remote VMs for
# `az` to access private storage buckets.
# `az account show` does not guarantee this file exists.
if not os.path.isfile(os.path.expanduser('~/.azure/accessTokens.json')):
return (
False,
'~/.azure/accessTokens.json does not exist. Run `az login`.' +
help_str)
try:
output = _run_output('az account show --output=json')
except subprocess.CalledProcessError:
return False, 'Azure CLI returned error.'
# If Azure is properly logged in, this will return something like:
# {"id": ..., "user": ...}
# and if not, it will return:
# Please run 'az login' to setup account.
if output.startswith('{'):
return True, None
return False, 'Azure credentials not set. Run `az login`.' + help_str

def get_credential_file_mounts(self) -> Dict[str, str]:
return {'~/.azure': '~/.azure'}
14 changes: 14 additions & 0 deletions prototype/sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,17 @@ def get_feasible_launchable_resources(self, resources):
Launchable resources require a cloud and an instance type be assigned.
"""
raise NotImplementedError

def check_credentials(self) -> Tuple[bool, Optional[str]]:
"""Checks if the user has access credentials to this cloud.
Returns a boolean of whether the user can access this cloud, and a
string describing the reason if the user cannot access.
"""
raise NotImplementedError

def get_credential_file_mounts(self) -> Dict[str, str]:
"""Returns the files necessary to access this cloud.
Returns a dictionary that will be added to a task's file mounts."""
raise NotImplementedError
37 changes: 37 additions & 0 deletions prototype/sky/clouds/gcp.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Google Cloud Platform."""
import copy
import json
import os
from typing import Dict, Iterator, List, Optional, Tuple

from google import auth

from sky import clouds
from sky.clouds.service_catalog import gcp_catalog

Expand Down Expand Up @@ -217,3 +220,37 @@ def get_accelerators_from_instance_type(
# GCP handles accelerators separately from regular instance types,
# hence return none here.
return None

def check_credentials(self) -> Tuple[bool, Optional[str]]:
"""Checks if the user has access credentials to this cloud."""
try:
# These files are required because they will be synced to remote
# VMs for `gsutil` to access private storage buckets.
# `auth.default()` does not guarantee these files exist.
for file in [
'~/.config/gcloud/access_tokens.db',
'~/.config/gcloud/credentials.db'
]:
assert os.path.isfile(os.path.expanduser(file))
# Calling `auth.default()` ensures the GCP client library works,
# which is used by Ray Autoscaler to launch VMs.
auth.default()
except (AssertionError, auth.exceptions.DefaultCredentialsError):
# See also: https://stackoverflow.com/a/53307505/1165051
return False, (
'GCP credentials not set. Run the following commands:\n '
# This authenticates the CLI to make `gsutil` work:
'$ gcloud auth login\n '
'$ gcloud config set project <proj>\n '
# These two commands setup the client library to make
# Ray Autoscaler work:
'$ gcloud auth application-default login\n '
'$ gcloud auth application-default set-quota-project '
'<proj>\n '
'For more info: '
'https://googleapis.dev/python/google-api-core/latest/auth.html'
)
return True, None

def get_credential_file_mounts(self) -> Dict[str, str]:
return {'~/.config/gcloud': '~/.config/gcloud'}
14 changes: 12 additions & 2 deletions prototype/sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import sky
from sky import backends
from sky import init
from sky import global_user_state
from sky import sky_logging
from sky import optimizer
Expand Down Expand Up @@ -84,16 +85,25 @@ def _execute(dag: sky.Dag,
cluster_name)
cluster_exists = existing_handle is not None

backend = backend if backend is not None else backends.CloudVmRayBackend()

if not cluster_exists and (stages is None or Stage.OPTIMIZE in stages):
if task.best_resources is None:
# TODO: fix this for the situation where number of requested
# accelerators is not an integer.
dag = sky.optimize(dag, minimize=optimize_target)
if isinstance(backend, backends.CloudVmRayBackend):
# TODO: adding this check because docker backend on a
# no-credential machine should not enter optimize(), which
# would directly error out ('No cloud is enabled...'). Fix by
# moving sky init checks out of optimize()?
dag = sky.optimize(dag, minimize=optimize_target)
task = dag.tasks[0] # Keep: dag may have been deep-copied.

backend = backend if backend is not None else backends.CloudVmRayBackend()
backend.register_info(dag=dag, optimize_target=optimize_target)

# FIXME: test on some node where the mounts do not exist.
task.update_file_mounts(init.get_cloud_credential_file_mounts())

if task.storage_mounts is not None:
# Optimizer should eventually choose where to store bucket
task.add_storage_mounts()
Expand Down
Loading

0 comments on commit a21505e

Please sign in to comment.