Skip to content

Commit

Permalink
[k8s] Update k8s docker image to use ray 2.9.3 (#3350)
Browse files Browse the repository at this point in the history
* Update to ray 2.9.3

* remove serve tests for k8s

* remove tests with autostop for k8s
  • Loading branch information
romilbhardwaj committed Mar 22, 2024
1 parent 4e93be5 commit 0b323bb
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Dockerfile_k8s
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ USER sky
# Install SkyPilot pip dependencies preemptively to speed up provisioning time
RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
pip install networkx oauth2client pandas pendulum PrettyTable && \
pip install ray[default]==2.4.0 rich tabulate filelock && \
pip install ray[default]==2.9.3 rich tabulate filelock && \
pip install packaging 'protobuf<4.0.0' pulp && \
pip install pycryptodome==3.12.0 && \
pip install docker kubernetes==28.1.0
Expand Down
5 changes: 2 additions & 3 deletions Dockerfile_k8s_gpu
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# TODO(romilb) - The base image used here (ray) is very large (11.4GB).
# As a result, the built image is about 13.5GB. We need to pick a lighter base
# image.
FROM rayproject/ray:2.4.0-py310-gpu
FROM rayproject/ray:2.9.3-py310-gpu

# Initialize conda for root user, install ssh and other local dependencies
# We remove cuda lists to avoid conflicts with the cuda version installed by ray
Expand Down Expand Up @@ -39,8 +39,7 @@ RUN pip install wheel Click colorama cryptography jinja2 jsonschema && \
pip install rich tabulate filelock && \
pip install packaging 'protobuf<4.0.0' pulp && \
pip install pycryptodome==3.12.0 && \
pip install docker kubernetes==28.1.0 && \
pip install -U boto3==1.34.46 # Ray base image has a stale boto3 version - https://github.com/skypilot-org/skypilot/issues/3158
pip install docker kubernetes==28.1.0

# Add /home/sky/.local/bin/ to PATH
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
Expand Down
12 changes: 12 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ def test_image_no_conda():
run_one_test(test)


@pytest.mark.no_kubernetes # Kubernetes does not support stopping instances
def test_custom_default_conda_env(generic_cloud: str):
name = _get_cluster_name()
test = Test('custom_default_conda_env', [
Expand Down Expand Up @@ -3027,6 +3028,7 @@ def test_skyserve_azure_http():


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_llm(generic_cloud: str):
"""Test skyserve with real LLM usecase"""
name = _get_service_name()
Expand Down Expand Up @@ -3084,6 +3086,7 @@ def test_skyserve_spot_recovery():


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_base_ondemand_fallback(generic_cloud: str):
name = _get_service_name()
test = Test(
Expand Down Expand Up @@ -3147,6 +3150,7 @@ def test_skyserve_dynamic_ondemand_fallback():


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_user_bug_restart(generic_cloud: str):
"""Tests that we restart the service after user bug."""
# TODO(zhwu): this behavior needs some rethinking.
Expand Down Expand Up @@ -3180,6 +3184,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_load_balancer(generic_cloud: str):
"""Test skyserve load balancer round-robin policy"""
name = _get_service_name()
Expand All @@ -3203,6 +3208,7 @@ def test_skyserve_load_balancer(generic_cloud: str):

@pytest.mark.gcp
@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_auto_restart():
"""Test skyserve with auto restart"""
name = _get_service_name()
Expand Down Expand Up @@ -3244,6 +3250,7 @@ def test_skyserve_auto_restart():


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_cancel(generic_cloud: str):
"""Test skyserve with cancel"""
name = _get_service_name()
Expand All @@ -3269,6 +3276,7 @@ def test_skyserve_cancel(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_update(generic_cloud: str):
"""Test skyserve with update"""
name = _get_service_name()
Expand Down Expand Up @@ -3297,6 +3305,7 @@ def test_skyserve_update(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_rolling_update(generic_cloud: str):
"""Test skyserve with rolling update"""
name = _get_service_name()
Expand Down Expand Up @@ -3333,6 +3342,7 @@ def test_skyserve_rolling_update(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_fast_update(generic_cloud: str):
"""Test skyserve with fast update (Increment version of old replicas)"""
name = _get_service_name()
Expand Down Expand Up @@ -3374,6 +3384,7 @@ def test_skyserve_fast_update(generic_cloud: str):


@pytest.mark.serve
@pytest.mark.no_kubernetes
def test_skyserve_update_autoscale(generic_cloud: str):
"""Test skyserve update with autoscale"""
name = _get_service_name()
Expand Down Expand Up @@ -3411,6 +3422,7 @@ def test_skyserve_update_autoscale(generic_cloud: str):

@pytest.mark.serve
@pytest.mark.parametrize('mode', ['rolling', 'blue_green'])
@pytest.mark.no_kubernetes
def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str):
"""Test skyserve with update that changes autoscaler"""
name = _get_service_name() + mode
Expand Down

0 comments on commit 0b323bb

Please sign in to comment.