Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MOD-3251] Stop heartbeats before sending the container snapshot RPC #2004

Merged
merged 28 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f3c04c6
Stop heartbeats before snapshotting
mattnappo Jul 12, 2024
41adde8
Merge branch 'main' into matt/gvisor-flake-fix
thundergolfer Jul 12, 2024
4cceb72
Added asyncio.Event for pausing heartbeats
mattnappo Jul 12, 2024
2cdcd60
Merge branch 'matt/gvisor-flake-fix' of github.com:modal-labs/modal-c…
mattnappo Jul 12, 2024
626af1c
Use asyncio.Condition to ensure mutual exclusion
mattnappo Jul 12, 2024
7228652
Fixed unit tests
mattnappo Jul 12, 2024
b0bef65
Merge branch 'main' of github.com:modal-labs/modal-client into matt/g…
mattnappo Jul 15, 2024
86b8fd5
Fixed bug, but now its slow
mattnappo Jul 15, 2024
4c6ed72
Fixed bottleneck
mattnappo Jul 15, 2024
096a399
Remove old logic
mattnappo Jul 15, 2024
bf5400b
Renamed cond var
mattnappo Jul 15, 2024
484e24d
Wrote tests
mattnappo Jul 15, 2024
b6fa509
Wrote better tests
mattnappo Jul 15, 2024
6071891
Try to fix tests
mattnappo Jul 15, 2024
70c49ff
Retry
mattnappo Jul 15, 2024
5b423f5
Undo warning
mattnappo Jul 15, 2024
817709f
Await
mattnappo Jul 15, 2024
3259b2b
Try using async client
mattnappo Jul 15, 2024
3c3209e
Use servicer
mattnappo Jul 15, 2024
a9070ff
Use servicer
mattnappo Jul 15, 2024
153eb0a
Lint
mattnappo Jul 15, 2024
5bb4044
Add sleep in test
mattnappo Jul 15, 2024
91b5665
Reduce sleep time
mattnappo Jul 15, 2024
7e7d692
Merge branch 'main' into matt/gvisor-flake-fix
thundergolfer Jul 16, 2024
74ae84a
Address review
mattnappo Jul 16, 2024
7bbae23
Merge branch 'matt/gvisor-flake-fix' of github.com:modal-labs/modal-c…
mattnappo Jul 16, 2024
c0f5fcb
Run entire restore phase within the lock
mattnappo Jul 16, 2024
87a9036
Merge branch 'main' into matt/gvisor-flake-fix
thundergolfer Jul 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion modal/_container_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ def main(container_args: api_pb2.ContainerArguments, client: Client):

_client: _Client = synchronizer._translate_in(client) # TODO(erikbern): ugly

with container_io_manager.heartbeats(), UserCodeEventLoop() as event_loop:
with container_io_manager.heartbeats(function_def.is_checkpointing_function), UserCodeEventLoop() as event_loop:
mattnappo marked this conversation as resolved.
Show resolved Hide resolved
# If this is a serialized function, fetch the definition from the server
if function_def.definition_type == api_pb2.Function.DEFINITION_TYPE_SERIALIZED:
ser_cls, ser_fun = container_io_manager.get_serialized_function()
Expand Down Expand Up @@ -779,6 +779,7 @@ def main(container_args: api_pb2.ContainerArguments, client: Client):
if function_def.is_checkpointing_function:
container_io_manager.memory_snapshot()


# Install hooks for interactive functions.
if function_def.pty_info.pty_type != api_pb2.PTYInfo.PTY_TYPE_UNSPECIFIED:

Expand Down
11 changes: 2 additions & 9 deletions modal/_container_io_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ def _init(self, container_args: api_pb2.ContainerArguments, client: _Client):
self._waiting_for_memory_snapshot = False
self._heartbeat_loop = None
self._pause_heartbeats = asyncio.Condition()
mattnappo marked this conversation as resolved.
Show resolved Hide resolved
self._snapshot_running = False

self._is_interactivity_enabled = False
self._fetching_inputs = True
Expand Down Expand Up @@ -163,15 +162,13 @@ async def _heartbeat_handle_cancellations(self) -> bool:
request.current_input_started_at = self.current_input_started_at

async with self._pause_heartbeats:
print("heartbeat acquired lock")
while self._snapshot_running:
await self._pause_heartbeats.wait()

# TODO(erikbern): capture exceptions?
response = await retry_transient_errors(
self._client.stub.ContainerHeartbeat, request, attempt_timeout=HEARTBEAT_TIMEOUT
)
print("heartbeat released lock")

if response.HasField("cancel_input_event"):
# Pause processing of the current input by signaling self a SIGUSR1.
Expand Down Expand Up @@ -206,11 +203,11 @@ async def _heartbeat_handle_cancellations(self) -> bool:
return False

@asynccontextmanager
async def heartbeats(self) -> AsyncGenerator[None, None]:
async def heartbeats(self, enable: bool) -> AsyncGenerator[None, None]:
async with TaskContext() as tc:
self._heartbeat_loop = t = tc.create_task(self._run_heartbeat_loop())
self._snapshot_running = False
t.set_name("heartbeat loop")
self._snapshot_running = enable
try:
yield
finally:
Expand Down Expand Up @@ -587,10 +584,8 @@ async def memory_restore(self) -> None:

# Turn heartbeats back on
mattnappo marked this conversation as resolved.
Show resolved Hide resolved
async with self._pause_heartbeats:
print("restore acquired lock")
self._snapshot_running = False
self._pause_heartbeats.notify_all()
print("restore released lock")

logger.debug("Container: restored")

Expand Down Expand Up @@ -644,14 +639,12 @@ async def memory_snapshot(self) -> None:

# Pause heartbeats since they keep the client connection open which causes the snapshotter to crash
async with self._pause_heartbeats:
print("snapshot acquired lock")
self._snapshot_running = True
self._pause_heartbeats.notify_all()

await self._client.stub.ContainerCheckpoint(
api_pb2.ContainerCheckpointRequest(checkpoint_id=self.checkpoint_id)
)
print("snapshot sent request")

self._waiting_for_memory_snapshot = True
await self._client._close(forget_credentials=True)
Expand Down
Loading