Fix vLLM and vLLM-on-Ray UT bug (#580)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
@@ -173,9 +173,9 @@ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export LLM_MODEL_ID_NAME="neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090"
 export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
-export vLLM_LLM_ENDPOINT="http://${host_ip}:8008"
-export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8008"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8005"
+export vLLM_LLM_ENDPOINT="http://${host_ip}:8007"
+export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8006"
 export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${host_ip}:6379"
 export INDEX_NAME="rag-redis"
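The remapping above gives each LLM backend its own host port (TGI 8005, vLLM-on-Ray 8006, vLLM 8007) instead of all three claiming 8008. A minimal sketch for checking that the remapped ports answer before running the tests; it assumes the Gaudi compose stack from this change is up and `host_ip` is exported, and the loop itself is illustrative rather than part of the diff:

```bash
#!/usr/bin/env bash
# Illustrative check (not part of this commit): probe each remapped LLM port.
# Assumes host_ip is exported and the compose stack from this change is running.
for port in 8005 8006 8007; do
    if curl -s -o /dev/null --connect-timeout 5 "http://${host_ip}:${port}"; then
        echo "port ${port}: reachable"
    else
        echo "port ${port}: no response yet"
    fi
done
```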
@@ -296,7 +296,7 @@ curl http://${host_ip}:8000/v1/reranking \
 
 ```bash
 #TGI Service
-curl http://${host_ip}:8008/generate \
+curl http://${host_ip}:8005/generate \
 -X POST \
 -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
 -H 'Content-Type: application/json'
@@ -304,7 +304,7 @@ curl http://${host_ip}:8008/generate \
 
 ```bash
 #vLLM Service
-curl http://${host_ip}:8008/v1/completions \
+curl http://${host_ip}:8007/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
 "model": "${LLM_MODEL_ID}",
@@ -316,7 +316,7 @@ curl http://${host_ip}:8008/v1/completions \
 
 ```bash
 #vLLM-on-Ray Service
-curl http://${host_ip}:8008/v1/chat/completions \
+curl http://${host_ip}:8006/v1/chat/completions \
 -H "Content-Type: application/json" \
 -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
 ```
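Each backend is exercised through a different API shape: TGI's native `/generate`, vLLM's OpenAI-compatible `/v1/completions`, and vLLM-on-Ray's `/v1/chat/completions`. A hedged sketch that hits all three remapped endpoints and reports only the HTTP status; the URLs and ports mirror the README hunks above, while the `check` helper and its payloads are illustrative:

```bash
#!/usr/bin/env bash
# Illustrative helper (not part of this commit): report the HTTP status of each
# LLM endpoint touched by this change. Assumes host_ip and LLM_MODEL_ID are exported.
check() {
    local name=$1 url=$2 payload=$3
    local code
    code=$(curl -s -o /dev/null -w '%{http_code}' -X POST \
        -H 'Content-Type: application/json' -d "${payload}" "${url}")
    echo "${name}: HTTP ${code}"
}

check "tgi"         "http://${host_ip}:8005/generate"            '{"inputs":"Hi","parameters":{"max_new_tokens":8}}'
check "vllm"        "http://${host_ip}:8007/v1/completions"      "{\"model\":\"${LLM_MODEL_ID}\",\"prompt\":\"Hi\",\"max_tokens\":8}"
check "vllm-on-ray" "http://${host_ip}:8006/v1/chat/completions" "{\"model\":\"${LLM_MODEL_ID}\",\"messages\":[{\"role\":\"user\",\"content\":\"Hi\"}]}"
```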
@@ -114,7 +114,7 @@ services:
     image: ghcr.io/huggingface/tgi-gaudi:2.0.1
     container_name: tgi-gaudi-server
     ports:
-      - "8008:80"
+      - "8005:80"
     volumes:
       - "./data:/data"
     environment:
@@ -112,7 +112,7 @@ services:
     image: opea/llm-vllm-hpu:latest
     container_name: vllm-gaudi-server
     ports:
-      - "8008:80"
+      - "8007:80"
     volumes:
       - "./data:/data"
     environment:
@@ -122,12 +122,12 @@ services:
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      LLM_MODEL: ${LLM_MODEL_ID}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
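Besides renaming the environment variable to `LLM_MODEL_ID`, the new command pins vLLM's serving parameters (`--block-size 128`, `--max-num-seqs 256`, `--max-seq_len-to-capture 2048`). A minimal standalone sketch of the same launch outside compose, assuming vLLM is installed in the current Python environment and `LLM_MODEL_ID` is exported; the flags are copied from the compose command above, the port 8007 mirrors the new host mapping, and everything else is illustrative:

```bash
#!/usr/bin/env bash
# Illustrative standalone launch (not part of this commit), mirroring the
# updated compose command. Assumes vLLM is installed and LLM_MODEL_ID is exported.
export VLLM_CPU_KVCACHE_SPACE=40
python3 -m vllm.entrypoints.openai.api_server \
    --enforce-eager \
    --model "$LLM_MODEL_ID" \
    --tensor-parallel-size 1 \
    --host 0.0.0.0 \
    --port 8007 \
    --block-size 128 \
    --max-num-seqs 256 \
    --max-seq_len-to-capture 2048
```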
@@ -112,7 +112,7 @@ services:
     image: opea/llm-vllm-ray-hpu:latest
     container_name: vllm-ray-gaudi-server
     ports:
-      - "8008:8000"
+      - "8006:8000"
     volumes:
       - "./data:/data"
     environment:
@@ -122,12 +122,12 @@ services:
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      LLM_MODEL: ${LLM_MODEL_ID}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager True"
+    command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True"
   llm:
     image: opea/llm-vllm-ray:latest
     container_name: llm-vllm-ray-gaudi-server
@@ -50,7 +50,7 @@ function start_services() {
 export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
 export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
+export TGI_LLM_ENDPOINT="http://${ip_address}:8005"
 export REDIS_URL="redis://${ip_address}:6379"
 export REDIS_HOST=${ip_address}
 export INDEX_NAME="rag-redis"
@@ -215,7 +215,7 @@ function validate_microservices() {
 
 # tgi for llm service
 validate_service \
-"${ip_address}:8008/generate" \
+"${ip_address}:8005/generate" \
 "generated_text" \
 "tgi-llm" \
 "tgi-gaudi-server" \
@@ -50,7 +50,8 @@ function start_services() {
 export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
 export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-export vLLM_LLM_ENDPOINT="http://${ip_address}:8008"
+export vLLM_LLM_ENDPOINT="http://${ip_address}:8007"
+export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${ip_address}:6379"
 export INDEX_NAME="rag-redis"
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
@@ -79,12 +80,13 @@ function start_services() {
 # Start Docker Containers
 docker compose -f compose_vllm.yaml up -d
 n=0
-until [[ "$n" -ge 180 ]]; do
+until [[ "$n" -ge 25 ]]; do
+echo "n=$n"
 docker logs vllm-gaudi-server > vllm_service_start.log
-if grep -q Connected vllm_service_start.log; then
+if grep -q "Warmup finished" vllm_service_start.log; then
 break
 fi
-sleep 1s
+sleep 20s
 n=$((n+1))
 done
 }
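The test now waits for the Gaudi warmup to complete ("Warmup finished" in the container log) instead of the first "Connected" line, polling every 20 s for at most 25 iterations. A reusable sketch of that wait pattern; the marker string and bounds come from the diff, while the `wait_for_log` helper itself is illustrative:

```bash
#!/usr/bin/env bash
# Illustrative helper (not part of this commit): wait until a container's log
# contains a marker string, mirroring the polling loop used by the updated test.
wait_for_log() {
    local container=$1 marker=$2 retries=${3:-25} interval=${4:-20}
    local n=0
    until [[ "$n" -ge "$retries" ]]; do
        if docker logs "$container" 2>&1 | grep -q "$marker"; then
            echo "$container ready after $((n * interval))s"
            return 0
        fi
        sleep "${interval}s"
        n=$((n + 1))
    done
    echo "timed out waiting for '$marker' in $container logs" >&2
    return 1
}

# Example matching the updated UT:
# wait_for_log vllm-gaudi-server "Warmup finished"
```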
@@ -165,7 +167,7 @@ function validate_microservices() {
 
 # vllm for llm service
 validate_services \
-"${ip_address}:8008/v1/completions" \
+"${ip_address}:8007/v1/completions" \
 "text" \
 "vllm-llm" \
 "vllm-gaudi-server" \
@@ -185,7 +187,7 @@ function validate_megaservice() {
 # Curl the Mega Service
 validate_services \
 "${ip_address}:8888/v1/chatqna" \
-"billion" \
+"data:" \
 "mega-chatqna" \
 "chatqna-gaudi-backend-server" \
 '{"messages": "What is the revenue of Nike in 2023?"}'
@@ -26,16 +26,15 @@ function build_docker_images() {
 cd $WORKPATH/docker/ui
 docker build --no-cache -t opea/chatqna-ui:latest -f docker/Dockerfile .
 
-# cd $WORKPATH
-# git clone https://github.com/vllm-project/vllm.git
-# cd vllm
-# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
+# cd $WORKPATH
+# git clone https://github.com/vllm-project/vllm.git
+# cd vllm
+# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
 
 docker images
 }
 
 function start_services() {
+# build vllm for each test instead of pull from local registry
 cd $WORKPATH
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
@@ -73,18 +72,19 @@ function start_services() {
 sed -i "s#image: opea/chatqna-ui:latest#image: opea/chatqna-ui:${IMAGE_TAG}#g" compose_vllm.yaml
 sed -i "s#image: opea/chatqna-conversation-ui:latest#image: opea/chatqna-conversation-ui:${IMAGE_TAG}#g" compose_vllm.yaml
 sed -i "s#image: opea/*#image: ${IMAGE_REPO}opea/#g" compose_vllm.yaml
+sed -i "s#image: ${IMAGE_REPO}opea/vllm:latest#image: opea/vllm:latest#g" compose_vllm.yaml
 fi
 fi
 
 # Start Docker Containers
 docker compose -f compose_vllm.yaml up -d
 n=0
-until [[ "$n" -ge 100 ]]; do
+until [[ "$n" -ge 10 ]]; do
 docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log
 if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then
 break
 fi
-sleep 1s
+sleep 10s
 n=$((n+1))
 done
 }
@@ -185,7 +185,7 @@ function validate_megaservice() {
 # Curl the Mega Service
 validate_services \
 "${ip_address}:8888/v1/chatqna" \
-"billion" \
+"data" \
 "mega-chatqna" \
 "chatqna-xeon-backend-server" \
 '{"messages": "What is the revenue of Nike in 2023?"}'
@@ -50,7 +50,7 @@ function start_services() {
 export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
 export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8008"
+export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006"
 export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${ip_address}:6379"
 export INDEX_NAME="rag-redis"
@@ -80,12 +80,13 @@ function start_services() {
 # Start Docker Containers
 docker compose -f compose_vllm_ray.yaml up -d
 n=0
-until [[ "$n" -ge 400 ]]; do
+until [[ "$n" -ge 25 ]]; do
+echo "n=$n"
 docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log
-if grep -q Connected vllm_ray_service_start.log; then
+if grep -q "Warmup finished" vllm_ray_service_start.log; then
 break
 fi
-sleep 1s
+sleep 20s
 n=$((n+1))
 done
 }
@@ -166,7 +167,7 @@ function validate_microservices() {
 
 # vllm-on-ray for llm service
 validate_services \
-"${ip_address}:8008/v1/chat/completions" \
+"${ip_address}:8006/v1/chat/completions" \
 "content" \
 "vllm-ray-llm" \
 "vllm-ray-gaudi-server" \