llama2 on inferentia with tgi example
jlewitt1 committed Mar 21, 2024
1 parent 91eafef commit ebdc422
Showing 3 changed files with 303 additions and 0 deletions.
21 changes: 21 additions & 0 deletions examples/llama2-with-tgi-aws-inferentia2/README.md
@@ -0,0 +1,21 @@
# Deploy Llama 2 7B Model with TGI on AWS Inferentia

See a more detailed explanation of this example [on our site](https://www.run.house/examples/llama-tgi-inference-on-aws-inferentia).

This example demonstrates how to deploy a [Llama 2 7B model](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) with
[TGI](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on AWS Inferentia
using Runhouse, specifically with the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/).

## Setup credentials and dependencies
Install the required dependencies:
```shell
$ pip install -r requirements.txt
```

We'll be launching an AWS Inferentia instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to
make sure our AWS credentials are set up:
```shell
$ aws configure
$ sky check
```
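
After setup is complete, you can run the example end to end (it launches the cluster, deploys the model, and
sends a test prompt) with:
```shell
$ python tgi_llama2_inferentia.py
```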
1 change: 1 addition & 0 deletions examples/llama2-with-tgi-aws-inferentia2/requirements.txt
@@ -0,0 +1 @@
runhouse[aws]
281 changes: 281 additions & 0 deletions examples/llama2-with-tgi-aws-inferentia2/tgi_llama2_inferentia.py
@@ -0,0 +1,281 @@
# # Deploy Llama2 7B Model with TGI on AWS Inferentia2
# This example demonstrates how to deploy a
# [TGI model](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on AWS Inferentia2
# using Runhouse, specifically with the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/).
#
# ## Setup credentials and dependencies
# Install the required dependencies:
# ```shell
# $ pip install -r requirements.txt
# ```
#
# We'll be using [Llama2](https://huggingface.co/aws-neuron/Llama-2-7b-hf-neuron-budget), which is a gated
# model and requires a Hugging Face token in order to access it.
#
# To set up your Hugging Face token, run the following command in your local terminal:
# ```shell
# $ export HF_TOKEN=<your huggingface token>
# ```
#
# We'll be launching an AWS EC2 instance via [SkyPilot](https://github.com/skypilot-org/skypilot), so we need to make
# sure our AWS credentials are set up with SkyPilot:
# ```shell
# $ aws configure
# $ sky check
# ```
#
# ## Setting up a model class
import time
from pathlib import Path

import requests

# We import runhouse, the only library we need to have installed locally:
import runhouse as rh


# Next, we define a class that will hold the model and allow us to send prompts to it.
# You'll notice this class inherits from `rh.Module`.
# This is a Runhouse class that allows you to
# run code in your class on a remote machine.
#
# Learn more in the [Runhouse docs on functions and modules](/docs/tutorials/api-modules).
class TGIInference(rh.Module):
    def __init__(
        self,
        model_id="aws-neuron/Llama-2-7b-hf-neuron-budget",
        image_uri="ghcr.io/huggingface/neuronx-tgi:latest",
        max_batch_size=2,
        max_input_length=1024,
        max_batch_prefill_tokens=1024,
        max_total_tokens=2048,
        **model_kwargs,
    ):
        super().__init__()
        self.docker_client = None

        self.model_id = model_id
        self.image_uri = image_uri
        self.max_batch_size = max_batch_size
        self.max_input_length = max_input_length
        self.max_total_tokens = max_total_tokens
        self.max_batch_prefill_tokens = max_batch_prefill_tokens
        self.model_kwargs = model_kwargs

        self.container_port = 8080
        self.container_name = "text-generation-service"

    def _load_docker_client(self):
        import docker

        self.docker_client = docker.from_env()

    def _run_container(self):
        print("Running container")
        # https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference#using-a-neuron-model-from-the--huggingface-hub-recommended
        model_cmd = (
            f"--model-id {self.model_id} "
            f"--max-batch-size {self.max_batch_size} "
            f"--max-input-length {self.max_input_length} "
            f"--max-total-tokens {self.max_total_tokens} "
            f"--max-batch-prefill-tokens {self.max_batch_prefill_tokens}"
        )

        # Add any other model kwargs to the command
        # https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference#choosing-service-parameters
        for key, value in self.model_kwargs.items():
            model_cmd += f" --{key} {value}"
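        # With the defaults above, for example, the assembled command comes out to:
        #   --model-id aws-neuron/Llama-2-7b-hf-neuron-budget --max-batch-size 2 --max-input-length 1024
        #   --max-total-tokens 2048 --max-batch-prefill-tokens 1024
        # Any extra `model_kwargs` are appended verbatim as additional `--<key> <value>` flags.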

        # shared volume mounted at /data to cache the models
        home_dir = str(Path.home())
        data_volume_path = f"{home_dir}/data"

        start_time = time.time()
        timeout = 600

        # Load the HF token which was synced onto the cluster as part of the env setup
        hf_secret = rh.secret(provider="huggingface")
        hf_token = hf_secret.values.get("token")

        # Note: the instance type we've chosen has a single Neuron device, but we can specify multiple devices if relevant
        container = self.docker_client.containers.run(
            image=self.image_uri,
            name=self.container_name,
            detach=True,
            ports={"80/tcp": self.container_port},
            volumes={data_volume_path: {"bind": "/data", "mode": "rw"}},
            environment={"HF_TOKEN": hf_token},
            devices=["/dev/neuron0"],
            command=model_cmd,
        )
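        # The call above is roughly equivalent to running the following on the instance itself:
        #   docker run -d --name text-generation-service -p 8080:80 -v ~/data:/data \
        #       -e HF_TOKEN=<token> --device /dev/neuron0 ghcr.io/huggingface/neuronx-tgi:latest <model_cmd>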

print("Container started, waiting for model to load.")

# Wait for model to load inside the container
for line in container.logs(stream=True):
current_time = time.time()
elapsed_time = current_time - start_time

log_line = line.strip().decode("utf-8")
if "Connected" in log_line:
print("Finished loading model, endpoint is ready.")
break

if elapsed_time > timeout:
print(f"Failed to load model within {timeout} seconds. Exiting.")
break

    def _model_is_deployed(self):
        if self.docker_client is None:
            self._load_docker_client()

        containers = self.docker_client.containers.list(
            filters={"name": self.container_name}
        )
        return bool(containers)

    def deploy(self):
        if self._model_is_deployed():
            return

        print("Model has not yet been deployed, loading image and running container.")

        self._run_container()

    def restart_container(self):
        if self.docker_client is None:
            self._load_docker_client()

        try:
            container = self.docker_client.containers.get(self.container_name)
            container.stop()
            container.remove()
        except Exception as e:
            raise RuntimeError(f"Failed to stop or remove container: {e}")

        # Deploy a new container
        self.deploy()


# ## Setting up Runhouse primitives
#
# Now, we define the main function that will run locally when we run this script, and set up
# our Runhouse module on a remote cluster. First, we create a cluster with the desired instance type and provider.
# Our `instance_type` here is defined as `inf2.8xlarge`, which is
# an [AWS inferentia instance type on EC2](https://aws.amazon.com/ec2/instance-types/inf2/).
#
# We also open port 8080, which is the port that the TGI model will be running on.
#
# Learn more about clusters in the [Runhouse docs](/docs/tutorials/api-clusters).
#
# NOTE: Make sure that your code runs within an `if __name__ == "__main__":` block, as shown below. Otherwise,
# the script code will run when Runhouse attempts to run code remotely.
if __name__ == "__main__":
    port = 8080
    cluster = rh.cluster(
        name="rh-inf2-8xlarge",
        instance_type="inf2.8xlarge",
        image_id="ami-0d608117d5f482a2f",
        region="us-east-1",
        disk_size=512,
        provider="aws",
        open_ports=[port],
    ).up_if_not()

    # We can run commands directly on the cluster via `cluster.run()`. Here, we prepare the cluster for our
    # upcoming env (more on that below) by installing some AWS Neuron-specific libraries.
    # We install the `transformers-neuronx` library before the env is set up in order to avoid
    # [common errors](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/training-troubleshooting.html):
    cluster.run(
        [
            "pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com",
        ]
    )

    # Next, we define the environment for our module. This includes the required dependencies that need
    # to be installed on the remote machine, as well as any secrets that need to be synced up from local to remote.
    #
    # Passing `huggingface` to the `secrets` parameter will load the Hugging Face token we set up earlier. This is
    # needed to download the model from the Hugging Face model hub. Runhouse will handle saving the token down
    # on the cluster in the default Hugging Face token location (`~/.cache/huggingface/token`).
    #
    # Learn more in the [Runhouse docs on envs](/docs/tutorials/api-envs).
    env = rh.env(
        name="tgi_env",
        reqs=["docker"],
        secrets=["huggingface"],
        working_dir="./",
    )

    # Finally, we define our module and send it to the remote cluster. We construct it normally and then call
    # `get_or_to` to run it on the remote cluster. Using `get_or_to` allows us to load the existing Module
    # by the name `tgi-inference` if it was already put on the cluster. If we want to update the module each
    # time we run this script, we can use `to` instead of `get_or_to`.
    #
    # Note that we also pass the `env` object to the `get_or_to` method, which will ensure that the environment is
    # set up on the remote machine before the module is run.
    remote_tgi_model = TGIInference().get_or_to(cluster, env=env, name="tgi-inference")

    # ## Sharing an inference endpoint
    # We can publish this module for others to use:
    # ```python
    # remote_tgi_model.share(visibility="public")
    # ```

    # Alternatively, we can share with specific users:
    # ```python
    # remote_tgi_model.share(["user1@gmail.com", "user2@gmail.com"], access_level="read")
    # ```

    # Note: For more info on fine-grained access controls, see the
    # [Runhouse docs on sharing](https://www.run.house/docs/tutorials/quick-start-den#sharing).

    # ## Deploying the model
    # We can call the `deploy` method on the model class instance as if it were running locally.
    # This will load and run the model on the remote cluster.
    # We only need to do this setup step once, as further calls will use the existing Docker container deployed
    # on the cluster and maintain state between calls:
    remote_tgi_model.deploy()
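    # As a quick sanity check that the endpoint is up (assuming the standard TGI routes, which the
    # `neuronx-tgi` image inherits), you could hit the `/health` route; this is a sketch, not part
    # of the deployment flow:
    # ```python
    # health = requests.get(f"http://{cluster.address}:{port}/health")
    # print(health.status_code)  # expect 200 once the model has loaded
    # ```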

    # ## Sending a prompt to the model
    prompt_message = "What is Deep Learning?"

    # We'll send the prompt to TGI's `generate` endpoint. TGI also exposes an OpenAI-compatible
    # [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api#streaming);
    # see the sketch at the end of this example for more info on using it.

    # Call the model with the prompt messages:
    # Note: We can also update some of the [default parameters](https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate)
    data = {
        "inputs": prompt_message,
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0.9,
            "top_p": 0.92,
            "do_sample": True,
            "repetition_penalty": 1.2,
            "frequency_penalty": 1.0,
        },
    }
    headers = {
        "Content-Type": "application/json",
    }

    response = requests.post(
        f"http://{cluster.address}:{port}/generate", headers=headers, json=data
    )
    print(response.json())
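    # A successful response is a JSON object along the lines of `{"generated_text": "..."}`.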

    # For streaming results, use the `/generate_stream` endpoint and iterate over the streamed lines:
    # ```python
    # resp = requests.post(
    #     f"http://{cluster.address}:{port}/generate_stream", headers=headers, json=data, stream=True
    # )
    # for message in resp.iter_lines():
    #     print(message)
    # ```

    # Alternatively, we can call the model directly from the command line with `curl`.
    # Note: We can also use the streaming route by replacing `generate` with `generate_stream`:
    print(
        f"curl http://{cluster.address}:{port}/generate -X POST -d '"
        f'{{"inputs":"{prompt_message}","parameters":{{"max_new_tokens":20}}}}'
        "' -H 'Content-Type: application/json'"
    )
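
    # ## Calling the Messages API (optional)
    # As mentioned above, TGI also exposes an OpenAI-compatible [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api)
    # at `/v1/chat/completions`. A minimal sketch, assuming that route is enabled in the `neuronx-tgi`
    # image you are running:
    # ```python
    # chat_response = requests.post(
    #     f"http://{cluster.address}:{port}/v1/chat/completions",
    #     headers=headers,
    #     json={
    #         "model": "tgi",
    #         "messages": [{"role": "user", "content": prompt_message}],
    #         "max_tokens": 50,
    #         "stream": False,
    #     },
    # )
    # print(chat_response.json())
    # ```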
