Skip to content

Commit

Permalink
Switch multinode_cpu_cluster to `multinode_cpu_docker_conda_cluster…
Browse files Browse the repository at this point in the history
…`. (#1253)
  • Loading branch information
rohinb2 committed Sep 11, 2024
1 parent df43988 commit 4b5a6b3
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 35 deletions.
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class TestCluster(tests.test_resources.test_resource.TestResource):
"docker_cluster_pwd_ssh_no_auth",
"ondemand_aws_docker_cluster",
"static_cpu_pwd_cluster",
"multinode_cpu_cluster"
"multinode_cpu_docker_conda_cluster"
]
}
Expand Down Expand Up @@ -236,7 +236,7 @@ def event_loop():
from tests.fixtures.on_demand_cluster_fixtures import (
a10g_gpu_cluster, # noqa: F401
k80_gpu_cluster, # noqa: F401
multinode_cpu_cluster, # noqa: F401
multinode_cpu_docker_conda_cluster, # noqa: F401
multinode_gpu_cluster, # noqa: F401
ondemand_aws_docker_cluster, # noqa: F401
ondemand_aws_https_cluster_with_auth, # noqa: F401
Expand Down Expand Up @@ -375,7 +375,7 @@ def event_loop():
"ondemand_k8s_cluster",
"ondemand_k8s_docker_cluster",
"ondemand_aws_https_cluster_with_auth",
"multinode_cpu_cluster",
"multinode_cpu_docker_conda_cluster",
"static_cpu_pwd_cluster",
"multinode_gpu_cluster", # for testing cluster status on multinode gpu.
]
Expand Down
8 changes: 6 additions & 2 deletions tests/fixtures/on_demand_cluster_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,16 @@ def a10g_gpu_cluster(request):


@pytest.fixture(scope="session")
def multinode_cpu_cluster(request):
def multinode_cpu_docker_conda_cluster(request):
args = {
"name": "rh-cpu-multinode",
"num_instances": NUM_OF_INSTANCES,
"image_id": "docker:rayproject/ray:latest-py311-cpu",
"default_env": rh.env(reqs=["ray==2.30.0"], working_dir=None),
"default_env": rh.conda_env(
name="default_env",
reqs=test_env().reqs + ["ray==2.30.0"],
conda_env={"dependencies": ["python=3.11"], "name": "default_env"},
),
"provider": "aws",
"instance_type": "CPU:2+",
}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_obj_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
"ondemand_k8s_cluster",
"ondemand_k8s_docker_cluster",
"ondemand_aws_https_cluster_with_auth",
"multinode_cpu_cluster",
"multinode_cpu_docker_conda_cluster",
"static_cpu_pwd_cluster",
]
}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_resources/test_clusters/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ class TestCluster(tests.test_resources.test_resource.TestResource):
"docker_cluster_pk_ssh_den_auth",
"docker_cluster_pwd_ssh_no_auth",
"static_cpu_pwd_cluster",
"multinode_cpu_cluster",
"multinode_cpu_docker_conda_cluster",
]
}

Expand Down
57 changes: 32 additions & 25 deletions tests/test_resources/test_clusters/test_multinode_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,23 @@

class TestMultiNodeCluster:
@pytest.mark.level("release")
def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_cluster):
worker_node = multinode_cpu_cluster.ips[-1]
def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_docker_conda_cluster):
worker_node = multinode_cpu_docker_conda_cluster.ips[-1]
local_rh_package_path = Path(importlib.util.find_spec("runhouse").origin).parent

local_rh_package_path = local_rh_package_path.parent
dest_path = f"~/{local_rh_package_path.name}"

# Rsync Runhouse package onto the worker node
multinode_cpu_cluster.rsync(
multinode_cpu_docker_conda_cluster.rsync(
source=str(local_rh_package_path),
dest=dest_path,
up=True,
node=worker_node,
contents=True,
)

status_codes = multinode_cpu_cluster.run(
status_codes = multinode_cpu_docker_conda_cluster.run(
[f"ls -l {dest_path}"], node=worker_node
)
assert status_codes[0][0] == 0
Expand All @@ -34,11 +34,13 @@ def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_cluster):

@pytest.mark.level("release")
def test_ray_started_on_worker_node_after_cluster_restart(
self, multinode_cpu_cluster
self, multinode_cpu_docker_conda_cluster
):
head_node = multinode_cpu_cluster.ips[0]
head_node = multinode_cpu_docker_conda_cluster.ips[0]

status_codes = multinode_cpu_cluster.run(["ray status"], node=head_node)
status_codes = multinode_cpu_docker_conda_cluster.run(
["ray status"], node=head_node
)
assert status_codes[0][0] == 0

status_output = status_codes[0][1]
Expand All @@ -47,72 +49,77 @@ def test_ray_started_on_worker_node_after_cluster_restart(
assert num_nodes == 2

@pytest.mark.level("release")
def test_send_envs_to_specific_worker_node(self, multinode_cpu_cluster):
def test_send_envs_to_specific_worker_node(
self, multinode_cpu_docker_conda_cluster
):

env_0 = rh.env(
name="worker_env_0",
reqs=["langchain", "pytest"],
).to(multinode_cpu_cluster, node_idx=0)
).to(multinode_cpu_docker_conda_cluster, node_idx=0)

env_1 = rh.env(
name="worker_env_1",
reqs=["torch", "pytest"],
).to(multinode_cpu_cluster, node_idx=1)
).to(multinode_cpu_docker_conda_cluster, node_idx=1)

env_2 = rh.env(
name="worker_env_2",
reqs=["transformers", "pytest"],
)

with pytest.raises(ValueError):
env_2.to(multinode_cpu_cluster, node_idx=len(multinode_cpu_cluster.ips))
env_2.to(
multinode_cpu_docker_conda_cluster,
node_idx=len(multinode_cpu_docker_conda_cluster.ips),
)

env_2.to(multinode_cpu_cluster, node_idx=1)
env_2.to(multinode_cpu_docker_conda_cluster, node_idx=1)

get_pid_0 = rh.function(get_pid_and_ray_node).to(
name="get_pid_0", system=multinode_cpu_cluster, env=env_0
name="get_pid_0", system=multinode_cpu_docker_conda_cluster, env=env_0
)
get_pid_1 = rh.function(get_pid_and_ray_node).to(
name="get_pid_1", system=multinode_cpu_cluster, env=env_1
name="get_pid_1", system=multinode_cpu_docker_conda_cluster, env=env_1
)
get_pid_2 = rh.function(get_pid_and_ray_node).to(
name="get_pid_2", system=multinode_cpu_cluster, env=env_2
name="get_pid_2", system=multinode_cpu_docker_conda_cluster, env=env_2
)
assert get_pid_0()[1] != get_pid_1()[1]
assert get_pid_1()[1] == get_pid_2()[1]

@pytest.mark.level("release")
def test_specifying_resources(self, multinode_cpu_cluster):
def test_specifying_resources(self, multinode_cpu_docker_conda_cluster):
env0 = rh.env(
name="worker_env_0",
compute={"CPU": 1.75},
).to(multinode_cpu_cluster)
).to(multinode_cpu_docker_conda_cluster)

env1 = rh.env(
name="worker_env_1",
compute={"CPU": 0.5},
).to(multinode_cpu_cluster)
).to(multinode_cpu_docker_conda_cluster)

env2 = rh.env(
name="worker_env_2",
compute={"memory": 4 * 1024 * 1024 * 1024},
).to(multinode_cpu_cluster)
).to(multinode_cpu_docker_conda_cluster)

env3 = rh.env(
name="worker_env_3",
compute={"CPU": 0.1, "memory": 2 * 1024 * 1024 * 1024},
).to(multinode_cpu_cluster)
).to(multinode_cpu_docker_conda_cluster)

status = multinode_cpu_cluster.status()
status = multinode_cpu_docker_conda_cluster.status()

env0_node = status["env_servlet_processes"][env0.name]["node_ip"]
env1_node = status["env_servlet_processes"][env1.name]["node_ip"]
env2_node = status["env_servlet_processes"][env2.name]["node_ip"]
env3_node = status["env_servlet_processes"][env3.name]["node_ip"]
assert env0_node in multinode_cpu_cluster.internal_ips
assert env1_node in multinode_cpu_cluster.internal_ips
assert env2_node in multinode_cpu_cluster.internal_ips
assert env3_node in multinode_cpu_cluster.internal_ips
assert env0_node in multinode_cpu_docker_conda_cluster.internal_ips
assert env1_node in multinode_cpu_docker_conda_cluster.internal_ips
assert env2_node in multinode_cpu_docker_conda_cluster.internal_ips
assert env3_node in multinode_cpu_docker_conda_cluster.internal_ips

assert env0_node != env1_node # Too much CPU
assert env2_node != env3_node # Too much memory
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class TestOnDemandCluster(tests.test_resources.test_clusters.test_cluster.TestCl
"k80_gpu_cluster",
"a10g_gpu_cluster",
"static_cpu_pwd_cluster",
"multinode_cpu_cluster",
"multinode_cpu_docker_conda_cluster",
"multinode_gpu_cluster",
]
}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_resources/test_envs/test_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class TestEnv(tests.test_resources.test_resource.TestResource):
"ondemand_k8s_docker_cluster",
"ondemand_aws_https_cluster_with_auth",
"static_cpu_pwd_cluster",
"multinode_cpu_cluster",
"multinode_cpu_docker_conda_cluster",
"docker_cluster_pk_ssh_no_auth",
"docker_cluster_pwd_ssh_no_auth",
"docker_cluster_pk_ssh_den_auth",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_resources/test_secrets/test_secret.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class TestSecret(tests.test_resources.test_resource.TestResource):
"ondemand_k8s_docker_cluster",
"ondemand_aws_https_cluster_with_auth",
"static_cpu_pwd_cluster",
"multinode_cpu_cluster",
"multinode_cpu_docker_conda_cluster",
"docker_cluster_pk_ssh_no_auth",
"docker_cluster_pwd_ssh_no_auth",
"docker_cluster_pk_ssh_den_auth",
Expand Down

0 comments on commit 4b5a6b3

Please sign in to comment.