Skip to content

Commit

Permalink
Always install aws and gcloud utils
Browse files Browse the repository at this point in the history
  • Loading branch information
franklsf95 committed Dec 30, 2021
1 parent f13fc34 commit a4630b1
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 54 deletions.
46 changes: 30 additions & 16 deletions prototype/sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,35 @@ def make_safe_symlink_command(
return ' && '.join(commands)


def _make_setup_sh(task: task_lib.Task) -> Optional[str]:
"""Make the setup script and return its path."""
gcloud_sdk_url = (
'https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/'
'google-cloud-sdk-367.0.0-linux-x86_64.tar.gz')
codegen = textwrap.dedent(f"""#!/bin/bash
# Install AWS CLI
pip install awscli
# Install Google Cloud SDK
wget --quiet {gcloud_sdk_url}
tar xzf google-cloud-sdk-367.0.0-linux-x86_64.tar.gz
mv google-cloud-sdk ~
~/google-cloud-sdk/install.sh -q
. $(conda info --base)/etc/profile.d/conda.sh
# Task setup
{task.setup or ''}
""")
# Use a stable path, /<tempdir>/sky_setup_<checksum>.sh, because
# rerunning the same task without any changes to the content of the
# setup command should skip the setup step. Using NamedTemporaryFile()
# would generate a random path every time, hence re-triggering setup.
checksum = zlib.crc32(codegen.encode())
tempdir = tempfile.gettempdir()
# TODO: file lock on this path, in case tasks have the same setup cmd.
with open(os.path.join(tempdir, f'sky_setup_{checksum}.sh'), 'w') as f:
f.write(codegen)
return f.name


# TODO: too many things happening here - leaky abstraction. Refactor.
def write_cluster_config(run_id: RunId,
task: task_lib.Task,
Expand Down Expand Up @@ -200,22 +229,7 @@ def write_cluster_config(run_id: RunId,

assert cluster_name is not None

setup_sh_path = None
if task.setup is not None:
codegen = textwrap.dedent(f"""#!/bin/bash
. $(conda info --base)/etc/profile.d/conda.sh
{task.setup}
""")
# Use a stable path, /<tempdir>/sky_setup_<checksum>.sh, because
# rerunning the same task without any changes to the content of the
# setup command should skip the setup step. Using NamedTemporaryFile()
# would generate a random path every time, hence re-triggering setup.
checksum = zlib.crc32(codegen.encode())
tempdir = tempfile.gettempdir()
# TODO: file lock on this path, in case tasks have the same setup cmd.
with open(os.path.join(tempdir, f'sky_setup_{checksum}.sh'), 'w') as f:
f.write(codegen)
setup_sh_path = f.name
setup_sh_path = _make_setup_sh(task)

# File mounts handling for remote paths possibly without write access:
# (1) in 'file_mounts' sections, add <prefix> to these target paths.
Expand Down
44 changes: 6 additions & 38 deletions prototype/sky/cloud_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,6 @@ def make_sync_file_command(self, source: str, destination: str) -> str:
class S3CloudStorage(CloudStorage):
"""AWS Cloud Storage."""

# List of commands to install AWS CLI
_GET_AWSCLI = [
'pip install awscli',
]

def is_file(self, url: str) -> bool:
"""Returns whether <url> is a regular file."""
bucket_name, path = data_utils.split_s3_path(url)
Expand All @@ -62,21 +57,13 @@ def make_sync_dir_command(self, source: str, destination: str) -> str:
# AWS Sync by default uses 10 threads to upload files to the bucket.
# To increase parallelism, modify max_concurrent_requests in your
# aws config file (Default path: ~/.aws/config).
download_via_awscli = f'mkdir -p {destination} && \
aws s3 sync {source} {destination} --delete'

all_commands = list(self._GET_AWSCLI)
all_commands.append(download_via_awscli)
return ' && '.join(all_commands)
return (f'mkdir -p {destination} && '
f'aws s3 sync {source} {destination} --delete')

def make_sync_file_command(self, source: str, destination: str) -> str:
"""Downloads a file using AWS CLI."""
download_via_awscli = f'mkdir -p {destination} && \
aws s3 cp {source} {destination}'

all_commands = list(self._GET_AWSCLI)
all_commands.append(download_via_awscli)
return ' && '.join(all_commands)
return (f'mkdir -p {destination} && '
f'aws s3 cp {source} {destination}')


class GcsCloudStorage(CloudStorage):
Expand All @@ -85,18 +72,6 @@ class GcsCloudStorage(CloudStorage):
# We use gsutil as a basic implementation. One pro is that its -m
# multi-threaded download is nice, which frees us from implementing
    # parallel workers on our end.
_GET_GSUTIL = [
# Skip if gsutil already exists.
'pushd /tmp &>/dev/null',
'(test -f ~/google-cloud-sdk/bin/gsutil || (wget --quiet '
'https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/'
'google-cloud-sdk-367.0.0-linux-x86_64.tar.gz && '
'tar xzf google-cloud-sdk-367.0.0-linux-x86_64.tar.gz && '
'mv google-cloud-sdk ~/ && '
'~/google-cloud-sdk/install.sh -q ))',
'popd &>/dev/null',
]

_GSUTIL = '~/google-cloud-sdk/bin/gsutil'

def is_file(self, url: str) -> bool:
Expand All @@ -116,18 +91,11 @@ def is_file(self, url: str) -> bool:

def make_sync_dir_command(self, source: str, destination: str) -> str:
"""Downloads a directory using gsutil."""
download_via_gsutil = (
f'{self._GSUTIL} -m rsync -d -r {source} {destination}')
all_commands = list(self._GET_GSUTIL)
all_commands.append(download_via_gsutil)
return ' && '.join(all_commands)
return f'{self._GSUTIL} -m rsync -d -r {source} {destination}'

def make_sync_file_command(self, source: str, destination: str) -> str:
"""Downloads a file using gsutil."""
download_via_gsutil = f'{self._GSUTIL} -m cp {source} {destination}'
all_commands = list(self._GET_GSUTIL)
all_commands.append(download_via_gsutil)
return ' && '.join(all_commands)
return f'{self._GSUTIL} -m cp {source} {destination}'


def get_storage_from_path(url: str) -> CloudStorage:
Expand Down

0 comments on commit a4630b1

Please sign in to comment.