diff --git "a/llm/\346\234\215\345\212\241\345\214\226\351\203\250\347\275\262.md" "b/llm/\346\234\215\345\212\241\345\214\226\351\203\250\347\275\262.md" new file mode 100644 index 0000000000..449e5aafc1 --- /dev/null +++ "b/llm/\346\234\215\345\212\241\345\214\226\351\203\250\347\275\262.md" @@ -0,0 +1,154 @@ +# FastDeploy服务化部署 + +### 预安装: + +1.安装python3.8环境 + +2.安装paddlepaddle + +```bash +wget https://bj.bcebos.com/fastdeploy/llm/paddlepaddle_gpu-0.0.0-cp38-cp38-linux_x86_64.whl +pip install paddlepaddle_gpu-0.0.0-cp38-cp38-linux_x86_64.whl +``` + +3.导入PaddleNLP仓库,并安装wheel包以及自定义算子 + +```bash +git clone https://github.com/PaddlePaddle/PaddleNLP.git +cd PaddleNLP +#注意:如果paddlenlp最新版的代码执行期间出现bug,可以执行如下命令切换到之前版本: +#git checkout 111b381146183aa6416343685d6394ee5d126981 +python3 setup.py bdist_wheel +cd dist +pip install $(ls) # wheel包文件 +cd .. +cd csrc +python3 setup_cuda.py install --user #安装自定义算子 +``` + +4.导入FastDeploy仓库,并安装wheel包 + +```bash +git clone -b llm https://github.com/PaddlePaddle/FastDeploy.git +cd FastDeploy/llm +python3 setup.py bdist_wheel +cd dist +pip install $(ls) # wheel包文件 +``` + +### **利用PaddleNLP导出推理模型** + +```bash +export_model_name = "THUDM/chatglm-6b" #这里指定导出的模型名称 +output_model_path = "chatglm-6b-fp16" #这里指定导出模型的路径 +cd PaddleNLP/llm +python3 export_model.py --model_name_or_path ${export_model_name} --output_path ${output_model_path} --dtype float16 --inference_model +``` + +### 模型转换 + +将PaddleNLP导出的模型转换为服务化部署的模型结构 + +```bash +wget https://bj.bcebos.com/fastdeploy/llm/gen_serving_model.sh +serving_model_path= "chatglm-6b-fp16-serving" #这里指定服务化模型存储目录 +# 第一个参数为PaddleNLP导出模型目录,第二个参数为存储服务化模型的目录路径 +bash gen_serving_model.sh ${output_model_path} ${serving_model_path} +``` + +### **部署方式** + +```bash +# 1、拉取docker镜像,创建docker,要求cuda驱动大于520 +docker pull registry.baidubce.com/paddlepaddle/fastdeploy-llm:0.0.9 +# 2.创建容器,挂载模型路径到容器中,进入docker +nvidia-docker run --name 容器名 -v $PWD:/work --network=host --privileged --shm-size=5g -it registry.baidubce.com/paddlepaddle/fastdeploy-llm:0.0.9 /bin/bash + +# 3、进入docker,设置如下环境变量,并且启动triton服务 +export FLAGS_cache_inference_while_scope=1 +export BATCH_SIZE=8 #指定batch_size +export IS_PTUNING=0 #非ptuning模型 +# 配置此环境变量,会将接收到的请求dump到日志,便于后期追查问题 +export ENABLE_DEBUG_LOG=1 + +rm -rf /dev/shm/* #清空共享内存 +ldconfig +#启动服务端服务 +tritonserver --model-repository ${serving_model_path} --http-port 8134 --grpc-port 8135 --metrics-port 8136 +``` + +### **客户端请求示例** +若没安装tritonclient[grpc],请先用 pip install tritonclient[grpc] 安装 +```python +import queue +import json +import sys +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + +def get_completion(text, model_name, grpc_url): + model_name = model_name + in_value = { + "text": text, + "topp": 0.0, + "temperature": 1.0, + "max_dec_len": 1024, + "min_dec_len": 2, + "penalty_score": 1.0, + "frequency_score": 0.99, + "eos_token_id": 2, + "model_id": "test", + "presence_score": 0.0 + } + inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))] + outputs = [grpcclient.InferRequestedOutput("OUT")] + user_data = UserData() + completion = "" + + is_error_request = False # 判断query是否处理失败 + + with grpcclient.InferenceServerClient(url=grpc_url, verbose=False) as triton_client: + 
### Client Request Example

If tritonclient[grpc] is not installed, install it first with `pip install tritonclient[grpc]`.

```python
import queue
import json
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype, InferenceServerException


class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    # The streaming callback pushes either a result or an error into the queue.
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)


def get_completion(text, model_name, grpc_url):
    in_value = {
        "text": text,
        "topp": 0.0,
        "temperature": 1.0,
        "max_dec_len": 1024,
        "min_dec_len": 2,
        "penalty_score": 1.0,
        "frequency_score": 0.99,
        "eos_token_id": 2,
        "model_id": "test",
        "presence_score": 0.0
    }
    inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.object_))]
    outputs = [grpcclient.InferRequestedOutput("OUT")]
    user_data = UserData()
    completion = ""

    is_error_request = False  # whether the query failed

    with grpcclient.InferenceServerClient(url=grpc_url, verbose=False) as triton_client:
        triton_client.start_stream(callback=partial(callback, user_data))
        in_data = np.array([json.dumps(in_value)], dtype=np.object_)
        inputs[0].set_data_from_numpy(in_data)
        triton_client.async_stream_infer(model_name=model_name, inputs=inputs, request_id="0", outputs=outputs)
        while True:
            data_item = user_data._completed_requests.get(timeout=300)
            if isinstance(data_item, InferenceServerException):
                print("Exception:", "status", data_item.status(), "msg", data_item.message())
                is_error_request = True
                break
            # Each streamed response carries one decoded segment; concatenate until is_end is True.
            results = data_item.as_numpy("OUT")[0]
            data = json.loads(results)
            completion += data["result"]
            if data.get("is_end", False):
                break
    return completion


grpc_url = "0.0.0.0:8135"
model_name = "model-aistudio"  # the serving models generated above are all named model-aistudio
result = get_completion("Hello, how are you", model_name, grpc_url)
print(result)
```

### Stopping the Processes

```bash
ps aux | grep tritonserver | awk '{print $2}' | xargs kill -9
ps aux | grep python3 | awk '{print $2}' | xargs kill -9
rm -rf /dev/shm/*
```
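Note that the commands above terminate every tritonserver and python3 process on the machine. If the host runs other Python workloads, a narrower variant (a sketch, assuming the server was started with the `tritonserver --model-repository ...` command shown earlier) is:

```bash
# Kill only the tritonserver instance launched above; pkill -f matches against the full command line.
pkill -9 -f "tritonserver --model-repository"
# Clear the shared-memory segments left behind by the service.
rm -rf /dev/shm/*
```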