Fix vLLM and vLLM-on-Ray UT bug (#580)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
@@ -173,9 +173,9 @@ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export LLM_MODEL_ID_NAME="neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090"
 export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
-export vLLM_LLM_ENDPOINT="http://${host_ip}:8008"
-export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8008"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8005"
+export vLLM_LLM_ENDPOINT="http://${host_ip}:8007"
+export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8006"
 export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${host_ip}:6379"
 export INDEX_NAME="rag-redis"
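The remapping above gives each LLM backend its own host port (TGI 8005, vLLM-on-Ray 8006, vLLM 8007) instead of all three claiming 8008. A minimal sketch for checking that the remapped ports answer before running the tests; it assumes the Gaudi compose stack from this change is up and `host_ip` is exported, and the loop itself is illustrative rather than part of the diff:

```bash
#!/usr/bin/env bash
# Illustrative check (not part of this commit): probe each remapped LLM port.
# Assumes host_ip is exported and the compose stack from this change is running.
for port in 8005 8006 8007; do
    if curl -s -o /dev/null --connect-timeout 5 "http://${host_ip}:${port}"; then
        echo "port ${port}: reachable"
    else
        echo "port ${port}: no response yet"
    fi
done
```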
@@ -296,7 +296,7 @@ curl http://${host_ip}:8000/v1/reranking \
 
 ```bash
 #TGI Service
-curl http://${host_ip}:8008/generate \
+curl http://${host_ip}:8005/generate \
 -X POST \
 -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
 -H 'Content-Type: application/json'
@@ -304,7 +304,7 @@ curl http://${host_ip}:8008/generate \
 
 ```bash
 #vLLM Service
-curl http://${host_ip}:8008/v1/completions \
+curl http://${host_ip}:8007/v1/completions \
 -H "Content-Type: application/json" \
 -d '{
 "model": "${LLM_MODEL_ID}",
@@ -316,7 +316,7 @@ curl http://${host_ip}:8008/v1/completions \
 
 ```bash
 #vLLM-on-Ray Service
-curl http://${host_ip}:8008/v1/chat/completions \
+curl http://${host_ip}:8006/v1/chat/completions \
 -H "Content-Type: application/json" \
 -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
 ```
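Each backend is exercised through a different API shape: TGI's native `/generate`, vLLM's OpenAI-compatible `/v1/completions`, and vLLM-on-Ray's `/v1/chat/completions`. A hedged sketch that hits all three remapped endpoints and reports only the HTTP status; the URLs and ports mirror the README hunks above, while the `check` helper and its payloads are illustrative:

```bash
#!/usr/bin/env bash
# Illustrative helper (not part of this commit): report the HTTP status of each
# LLM endpoint touched by this change. Assumes host_ip and LLM_MODEL_ID are exported.
check() {
    local name=$1 url=$2 payload=$3
    local code
    code=$(curl -s -o /dev/null -w '%{http_code}' -X POST \
        -H 'Content-Type: application/json' -d "${payload}" "${url}")
    echo "${name}: HTTP ${code}"
}

check "tgi"         "http://${host_ip}:8005/generate"            '{"inputs":"Hi","parameters":{"max_new_tokens":8}}'
check "vllm"        "http://${host_ip}:8007/v1/completions"      "{\"model\":\"${LLM_MODEL_ID}\",\"prompt\":\"Hi\",\"max_tokens\":8}"
check "vllm-on-ray" "http://${host_ip}:8006/v1/chat/completions" "{\"model\":\"${LLM_MODEL_ID}\",\"messages\":[{\"role\":\"user\",\"content\":\"Hi\"}]}"
```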
@@ -114,7 +114,7 @@ services:
     image: ghcr.io/huggingface/tgi-gaudi:2.0.1
     container_name: tgi-gaudi-server
     ports:
-      - "8008:80"
+      - "8005:80"
     volumes:
       - "./data:/data"
     environment:
@@ -112,7 +112,7 @@ services:
     image: opea/llm-vllm-hpu:latest
     container_name: vllm-gaudi-server
     ports:
-      - "8008:80"
+      - "8007:80"
     volumes:
       - "./data:/data"
     environment:
@@ -122,12 +122,12 @@ services:
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      LLM_MODEL: ${LLM_MODEL_ID}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
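Besides renaming the environment variable to `LLM_MODEL_ID`, the new command pins vLLM's serving parameters (`--block-size 128`, `--max-num-seqs 256`, `--max-seq_len-to-capture 2048`). A minimal standalone sketch of the same launch outside compose, assuming vLLM is installed in the current Python environment and `LLM_MODEL_ID` is exported; the flags are copied from the compose command above, the port 8007 mirrors the new host mapping, and everything else is illustrative:

```bash
#!/usr/bin/env bash
# Illustrative standalone launch (not part of this commit), mirroring the
# updated compose command. Assumes vLLM is installed and LLM_MODEL_ID is exported.
export VLLM_CPU_KVCACHE_SPACE=40
python3 -m vllm.entrypoints.openai.api_server \
    --enforce-eager \
    --model "$LLM_MODEL_ID" \
    --tensor-parallel-size 1 \
    --host 0.0.0.0 \
    --port 8007 \
    --block-size 128 \
    --max-num-seqs 256 \
    --max-seq_len-to-capture 2048
```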
@@ -112,7 +112,7 @@ services:
     image: opea/llm-vllm-ray-hpu:latest
     container_name: vllm-ray-gaudi-server
     ports:
-      - "8008:8000"
+      - "8006:8000"
     volumes:
       - "./data:/data"
     environment:
@@ -122,12 +122,12 @@ services:
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      LLM_MODEL: ${LLM_MODEL_ID}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager True"
+    command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True"
   llm:
     image: opea/llm-vllm-ray:latest
     container_name: llm-vllm-ray-gaudi-server
@@ -50,7 +50,7 @@ function start_services() {
 export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
 export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
+export TGI_LLM_ENDPOINT="http://${ip_address}:8005"
 export REDIS_URL="redis://${ip_address}:6379"
 export REDIS_HOST=${ip_address}
 export INDEX_NAME="rag-redis"
@@ -215,7 +215,7 @@ function validate_microservices() {
 
 # tgi for llm service
 validate_service \
-"${ip_address}:8008/generate" \
+"${ip_address}:8005/generate" \
 "generated_text" \
 "tgi-llm" \
 "tgi-gaudi-server" \
@@ -50,7 +50,8 @@ function start_services() {
 export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
 export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-export vLLM_LLM_ENDPOINT="http://${ip_address}:8008"
+export vLLM_LLM_ENDPOINT="http://${ip_address}:8007"
+export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${ip_address}:6379"
 export INDEX_NAME="rag-redis"
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
@@ -79,12 +80,13 @@ function start_services() {
 # Start Docker Containers
 docker compose -f compose_vllm.yaml up -d
 n=0
-until [[ "$n" -ge 180 ]]; do
+until [[ "$n" -ge 25 ]]; do
+echo "n=$n"
 docker logs vllm-gaudi-server > vllm_service_start.log
-if grep -q Connected vllm_service_start.log; then
+if grep -q "Warmup finished" vllm_service_start.log; then
 break
 fi
-sleep 1s
+sleep 20s
 n=$((n+1))
 done
 }
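The test now waits for the Gaudi warmup to complete ("Warmup finished" in the container log) instead of the first "Connected" line, polling every 20 s for at most 25 iterations. A reusable sketch of that wait pattern; the marker string and bounds come from the diff, while the `wait_for_log` helper itself is illustrative:

```bash
#!/usr/bin/env bash
# Illustrative helper (not part of this commit): wait until a container's log
# contains a marker string, mirroring the polling loop used by the updated test.
wait_for_log() {
    local container=$1 marker=$2 retries=${3:-25} interval=${4:-20}
    local n=0
    until [[ "$n" -ge "$retries" ]]; do
        if docker logs "$container" 2>&1 | grep -q "$marker"; then
            echo "$container ready after $((n * interval))s"
            return 0
        fi
        sleep "${interval}s"
        n=$((n + 1))
    done
    echo "timed out waiting for '$marker' in $container logs" >&2
    return 1
}

# Example matching the updated UT:
# wait_for_log vllm-gaudi-server "Warmup finished"
```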
@@ -165,7 +167,7 @@ function validate_microservices() {
 
 # vllm for llm service
 validate_services \
-"${ip_address}:8008/v1/completions" \
+"${ip_address}:8007/v1/completions" \
 "text" \
 "vllm-llm" \
 "vllm-gaudi-server" \
@@ -185,7 +187,7 @@ function validate_megaservice() {
 # Curl the Mega Service
 validate_services \
 "${ip_address}:8888/v1/chatqna" \
-"billion" \
+"data:" \
 "mega-chatqna" \
 "chatqna-gaudi-backend-server" \
 '{"messages": "What is the revenue of Nike in 2023?"}'
@@ -26,16 +26,15 @@ function build_docker_images() {
 cd $WORKPATH/docker/ui
 docker build --no-cache -t opea/chatqna-ui:latest -f docker/Dockerfile .
 
-# cd $WORKPATH
-# git clone https://github.com/vllm-project/vllm.git
-# cd vllm
-# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
+# cd $WORKPATH
+# git clone https://github.com/vllm-project/vllm.git
+# cd vllm
+# docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
 
 docker images
 }
 
 function start_services() {
+# build vllm for each test instead of pull from local registry
 cd $WORKPATH
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
@@ -73,18 +72,19 @@ function start_services() {
 sed -i "s#image: opea/chatqna-ui:latest#image: opea/chatqna-ui:${IMAGE_TAG}#g" compose_vllm.yaml
 sed -i "s#image: opea/chatqna-conversation-ui:latest#image: opea/chatqna-conversation-ui:${IMAGE_TAG}#g" compose_vllm.yaml
 sed -i "s#image: opea/*#image: ${IMAGE_REPO}opea/#g" compose_vllm.yaml
+sed -i "s#image: ${IMAGE_REPO}opea/vllm:latest#image: opea/vllm:latest#g" compose_vllm.yaml
 fi
 fi
 
 # Start Docker Containers
 docker compose -f compose_vllm.yaml up -d
 n=0
-until [[ "$n" -ge 100 ]]; do
+until [[ "$n" -ge 10 ]]; do
 docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log
 if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then
 break
 fi
-sleep 1s
+sleep 10s
 n=$((n+1))
 done
 }
@@ -185,7 +185,7 @@ function validate_megaservice() {
 # Curl the Mega Service
 validate_services \
 "${ip_address}:8888/v1/chatqna" \
-"billion" \
+"data" \
 "mega-chatqna" \
 "chatqna-xeon-backend-server" \
 '{"messages": "What is the revenue of Nike in 2023?"}'
@@ -50,7 +50,7 @@ function start_services() {
 export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
 export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8008"
+export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006"
 export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${ip_address}:6379"
 export INDEX_NAME="rag-redis"
@@ -80,12 +80,13 @@ function start_services() {
 # Start Docker Containers
 docker compose -f compose_vllm_ray.yaml up -d
 n=0
-until [[ "$n" -ge 400 ]]; do
+until [[ "$n" -ge 25 ]]; do
+echo "n=$n"
 docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log
-if grep -q Connected vllm_ray_service_start.log; then
+if grep -q "Warmup finished" vllm_ray_service_start.log; then
 break
 fi
-sleep 1s
+sleep 20s
 n=$((n+1))
 done
 }
@@ -166,7 +167,7 @@ function validate_microservices() {
 
 # vllm-on-ray for llm service
 validate_services \
-"${ip_address}:8008/v1/chat/completions" \
+"${ip_address}:8006/v1/chat/completions" \
 "content" \
 "vllm-ray-llm" \
 "vllm-ray-gaudi-server" \