From 8a58877c81173181e634431bc208e35f3e82aeed Mon Sep 17 00:00:00 2001
From: Josh Lewittes
Date: Fri, 15 Mar 2024 14:48:52 +0200
Subject: [PATCH] tgi inference example on ec2 (#585)

---
 .../tgi-inference-aws-ec2/requirements.txt   |   2 +
 .../tgi-inference-aws-ec2/tgi_mistral_ec2.py | 228 ++++++++++++++++++
 2 files changed, 230 insertions(+)
 create mode 100644 examples/tgi-inference-aws-ec2/requirements.txt
 create mode 100644 examples/tgi-inference-aws-ec2/tgi_mistral_ec2.py

diff --git a/examples/tgi-inference-aws-ec2/requirements.txt b/examples/tgi-inference-aws-ec2/requirements.txt
new file mode 100644
index 000000000..2337922d2
--- /dev/null
+++ b/examples/tgi-inference-aws-ec2/requirements.txt
@@ -0,0 +1,2 @@
+openai
+runhouse[sky]
diff --git a/examples/tgi-inference-aws-ec2/tgi_mistral_ec2.py b/examples/tgi-inference-aws-ec2/tgi_mistral_ec2.py
new file mode 100644
index 000000000..32bee039f
--- /dev/null
+++ b/examples/tgi-inference-aws-ec2/tgi_mistral_ec2.py
@@ -0,0 +1,228 @@
+# # Deploy Mistral's 7B TGI Model on AWS EC2
+
+# This example demonstrates how to deploy a
+# [TGI model](https://huggingface.co/docs/text-generation-inference/messages_api) on AWS EC2 using Runhouse.
+# It draws inspiration from
+# [Hugging Face's tutorial on TGI for AWS Inferentia2](https://huggingface.co/blog/text-generation-inference-on-inferentia2).
+# The model we deploy, OpenHermes 2.5, is a fine-tuned version of
+# [Mistral's 7B-v0.1 model](https://huggingface.co/mistralai/Mistral-7B-v0.1).
+#
+# ## Setup credentials and dependencies
+#
+# Install the required dependencies:
+# ```shell
+# $ pip install -r requirements.txt
+# ```
+#
+# We'll be launching an AWS EC2 instance via SkyPilot, so we need to make sure our AWS credentials are set up
+# with SkyPilot:
+# ```shell
+# $ aws configure
+# $ sky check
+# ```
+#
+# ## Setting up a model class
+# We import `runhouse` and `openai`, the only libraries we need installed locally:
+
+import time
+from pathlib import Path
+
+import runhouse as rh
+from openai import OpenAI
+
+
+# Next, we define a class that will hold the model and allow us to send prompts to it.
+# You'll notice this class inherits from `rh.Module`.
+# This is a Runhouse class that allows you to
+# run code in your class on a remote machine.
+#
+# Learn more in the [Runhouse docs on functions and modules](/docs/tutorials/api-modules).
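+#
+# In the class below, `deploy()` uses the Docker SDK on the cluster to pull the TGI image and start the
+# container, which then serves TGI's OpenAI-compatible Messages API on the port we map to (8080).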
+class TGIInference(rh.Module):
+    def __init__(
+        self,
+        model_id="teknium/OpenHermes-2.5-Mistral-7B",
+        image_uri="ghcr.io/huggingface/text-generation-inference:1.4",
+    ):
+        super().__init__()
+        self.docker_client = None
+
+        self.model_id = model_id
+        self.image_uri = image_uri
+
+        self.container_port = 8080
+        self.container_name = "text-generation-service"
+
+    def _load_docker_client(self):
+        import docker
+
+        self.docker_client = docker.from_env()
+
+    def _model_is_deployed(self):
+        if self.docker_client is None:
+            self._load_docker_client()
+
+        containers = self.docker_client.containers.list(
+            filters={"name": self.container_name}
+        )
+
+        return bool(containers)
+
+    def deploy(self):
+        # Adapted from: https://huggingface.co/docs/text-generation-inference/quicktour
+        import docker
+
+        if self._model_is_deployed():
+            return
+
+        print("Model has not yet been deployed, loading image and running container.")
+
+        home_dir = str(Path.home())
+        data_volume_path = f"{home_dir}/data"
+
+        device_request = docker.types.DeviceRequest(
+            count=-1,
+            capabilities=[["gpu"]],
+        )
+
+        start_time = time.time()
+        timeout = 300
+
+        container = self.docker_client.containers.run(
+            self.image_uri,
+            name=self.container_name,
+            detach=True,
+            ports={"80/tcp": self.container_port},
+            volumes={data_volume_path: {"bind": "/data", "mode": "rw"}},
+            command="--model-id " + self.model_id,
+            device_requests=[device_request],
+            shm_size="1g",
+        )
+
+        print("Container started, waiting for model to load.")
+
+        # Wait for the model to load inside the container by streaming its logs
+        # (note that the timeout is only checked when a new log line arrives)
+        for line in container.logs(stream=True):
+            current_time = time.time()
+            elapsed_time = current_time - start_time
+
+            log_line = line.strip().decode("utf-8")
+            if "Connected" in log_line:
+                print("Finished loading model, endpoint is ready.")
+                break
+
+            if elapsed_time > timeout:
+                print(f"Failed to load model within {timeout} seconds. Exiting.")
+                break
+
+    def restart_container(self):
+        if self.docker_client is None:
+            self._load_docker_client()
+
+        try:
+            container = self.docker_client.containers.get(self.container_name)
+            container.stop()
+            container.remove()
+        except Exception as e:
+            raise RuntimeError(f"Failed to stop or remove container: {e}")
+
+        # Deploy a new container
+        self.deploy()
+
+    # ---------------------------------------------------
+
+
+# ## Setting up Runhouse primitives
+#
+# Now, we define the main block that will run locally when we run this script, and set up
+# our Runhouse module on a remote cluster. First, we create a cluster with the desired instance type and provider.
+# Our `instance_type` here is defined as `g5.4xlarge`, which is
+# an [AWS instance type on EC2](https://aws.amazon.com/ec2/instance-types/g5/) with a GPU.
+# (For this model we'll need a GPU with at least 16GB of memory.)
+# We also open port 8080, which is the port that the TGI model will be running on.
+#
+# Learn more in the [Runhouse docs on clusters](/docs/tutorials/api-clusters).
+#
+# NOTE: Make sure that your code runs within an `if __name__ == "__main__":` block, as shown below. Otherwise,
+# the script code will run when Runhouse attempts to run code remotely.
+if __name__ == "__main__":
+    port = 8080
+    cluster = rh.cluster(
+        name="rh-g5-4xlarge",
+        instance_type="g5.4xlarge",
+        provider="aws",
+        open_ports=[port],
+    ).up_if_not()
+
+    # Next, we define the environment for our module. This includes the required dependencies that need
+    # to be installed on the remote machine, as well as any secrets that need to be synced up from local to remote.
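+    # For example, if you swapped in a gated model that needs a Hugging Face token, you could sync your
+    # local Hugging Face credentials up alongside the requirements (a sketch, assuming the `secrets`
+    # argument that `rh.env` accepts in recent Runhouse versions):
+    # env = rh.env(reqs=["docker", "openai", "torch", "transformers"], secrets=["huggingface"])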
+    #
+    # Learn more in the [Runhouse docs on envs](/docs/tutorials/api-envs).
+    env = rh.env(
+        name="tgi_env",
+        reqs=["docker", "openai", "torch", "transformers"],
+        working_dir="./",
+    )
+
+    # Finally, we define our module and run it on the remote cluster. We construct it normally and then call
+    # `get_or_to` to run it on the remote cluster. Using `get_or_to` allows us to load the existing Module
+    # by the name `tgi-inference` if it was already put on the cluster. If we want to update the module each
+    # time we run this script, we can use `to` instead of `get_or_to`.
+    #
+    # Note that we also pass the `env` object to the `get_or_to` method, which will ensure that the environment is
+    # set up on the remote machine before the module is run.
+    remote_tgi_model = TGIInference().get_or_to(cluster, env=env, name="tgi-inference")
+
+    # ## Sharing an inference endpoint
+    # We can publish this module for others to use:
+    # remote_tgi_model.share(visibility="public")
+
+    # Alternatively, we can share it with specific users:
+    # remote_tgi_model.share(["user1@gmail.com", "user2@gmail.com"], access_level="read")
+
+    # Note: For more info on fine-grained access controls, see the
+    # [Runhouse docs on sharing](https://www.run.house/docs/tutorials/quick-start-den#sharing).
+
+    # ## Deploying the model
+    # We can call the `deploy` method on the model class instance as if it were running locally.
+    # This will load and run the model on the remote cluster.
+    # We only need to do this setup step once; further calls will use the existing Docker container deployed
+    # on the cluster and maintain state between calls.
+    remote_tgi_model.deploy()
+
+    # ## Sending a prompt to the model
+    prompt_messages = [
+        {"role": "user", "content": "What is your favourite condiment?"},
+        {
+            "role": "assistant",
+            "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount "
+            "of zesty flavour to whatever I'm cooking up in the kitchen!",
+        },
+        {"role": "user", "content": "Do you have mayonnaise recipes?"},
+    ]
+
+    # We'll use the Messages API to send the prompt to the model.
+    # See [here](https://huggingface.co/docs/text-generation-inference/messages_api#streaming) for more info
+    # on the Messages API and on using the OpenAI Python client.
+    base_url = f"http://{cluster.address}:{port}/v1"
+
+    # Initialize the OpenAI client, pointing it at the TGI endpoint on the cluster
+    client = OpenAI(base_url=base_url, api_key="-")
+
+    # Call the model with the prompt messages
+    chat_completion = client.chat.completions.create(
+        model="tgi", messages=prompt_messages, stream=False
+    )
+    print(chat_completion)
+
+    # For streaming results, set `stream=True` in the call above and iterate over the returned chunks:
+    # for chunk in chat_completion:
+    #     print(chunk)
+
+    # Alternatively, we can call the model directly over HTTP
+    print("------------")
+    print("To call the model via HTTP, use the following cURL command:")
+    print(
+        f"curl http://{cluster.address}:{port}/v1/chat/completions -X POST -d '"
+        '{"model": "tgi", "stream": false, "messages": [{"role": "system", "content": "You are a helpful assistant."},'
+        '{"role": "user", "content": "What is deep learning?"}]}'
+        "' -H 'Content-Type: application/json'"
+    )
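+
+    # ## Cleaning up
+    # The EC2 instance stays up (and keeps billing) until you bring it down. As a sketch, assuming your
+    # Runhouse version exposes `teardown()` on on-demand clusters, you can tear it down from Python:
+    # cluster.teardown()
+    #
+    # Alternatively, since the cluster was launched via SkyPilot, you can bring it down from the shell
+    # (check `sky status` for the exact cluster name):
+    # ```shell
+    # $ sky down rh-g5-4xlarge
+    # ```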