Code Enhancement for vllm inference (#1729)
Signed-off-by: Yongbozzz <yongbo.zhu@intel.com>
@@ -17,7 +17,7 @@ quality and performance.
 
 ### (Optional) Build Docker Images for Mega Service, Server and UI by your own
 
-If you want to build the images by your own, please follow the steps:
+**All the Docker images can be pulled automatically.** If you want to build the images on your own, follow these steps:
 
 ```bash
 cd GenAIExamples/EdgeCraftRAG
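
For reference, the prebuilt images that the compose files fall back to can also be pulled explicitly; the names below are taken from the `${REGISTRY:-opea}/...:${TAG:-latest}` defaults in the compose file added later in this commit:

```bash
# Pull the default prebuilt EdgeCraftRAG images instead of building locally.
docker pull opea/edgecraftrag-server:latest
docker pull opea/edgecraftrag:latest
docker pull opea/edgecraftrag-ui:latest
```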
@@ -101,6 +101,26 @@ export HUGGINGFACEHUB_API_TOKEN=#your HF token
 docker compose -f compose_vllm.yaml up -d
 ```
 
+#### Launch services with vLLM for multi Intel Arc GPU inference
+
+The Docker image is pulled automatically, but you can also pull it manually:
+
+```bash
+docker pull intelanalytics/ipex-llm-serving-xpu:latest
+```
+
+Set up additional environment variables and start the services with compose_vllm_multi-arc.yaml:
+
+```bash
+export LLM_MODEL=#your model id
+export VLLM_SERVICE_PORT=8008
+export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
+export LLM_MODEL_PATH=#your model path
+export TENSOR_PARALLEL_SIZE=#number of Intel Arc GPUs to use for inference
+
+docker compose -f compose_vllm_multi-arc.yaml up -d
+```
+
 ### ChatQnA with LLM Example (Command Line)
 
 ```bash
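
Once the multi-Arc stack is up, the endpoint can be sanity-checked from the host. A minimal sketch, assuming the ipex-llm serving container exposes vLLM's standard OpenAI-compatible API (the compose file below maps `${VLLM_SERVICE_PORT}` to the container's port 8000):

```bash
# List the models served by the endpoint (OpenAI-compatible API).
curl -s "${vLLM_ENDPOINT}/v1/models"

# Send a one-shot completion request to the served model.
curl -s "${vLLM_ENDPOINT}/v1/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL}"'", "prompt": "Hello", "max_tokens": 32}'
```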
New file: compose_vllm_multi-arc.yaml
@@ -0,0 +1,93 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  server:
+    image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest}
+    container_name: edgecraftrag-server
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_ENDPOINT: ${HF_ENDPOINT}
+      vLLM_ENDPOINT: ${vLLM_ENDPOINT}
+      LLM_MODEL: ${LLM_MODEL}
+      ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false}
+    volumes:
+      - ${MODEL_PATH:-${PWD}}:/home/user/models
+      - ${DOC_PATH:-${PWD}}:/home/user/docs
+      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
+      - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
+      - ${PROMPT_PATH:-${PWD}}:/templates/custom
+    ports:
+      - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
+    devices:
+      - /dev/dri:/dev/dri
+    group_add:
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+  ecrag:
+    image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
+    container_name: edgecraftrag
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
+      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
+      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
+    ports:
+      - ${MEGA_SERVICE_PORT:-16011}:${MEGA_SERVICE_PORT:-16011}
+    depends_on:
+      - server
+  ui:
+    image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest}
+    container_name: edgecraftrag-ui
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
+      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
+      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
+      UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
+      UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
+    volumes:
+      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
+    ports:
+      - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
+    restart: always
+    depends_on:
+      - server
+      - ecrag
+  llm-serving-xpu:
+    container_name: ipex-llm-serving-xpu-container
+    image: intelanalytics/ipex-llm-serving-xpu:latest
+    privileged: true
+    ports:
+      - ${VLLM_SERVICE_PORT:-8008}:8000
+    group_add:
+      - video
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+    volumes:
+      - ${LLM_MODEL_PATH:-${PWD}}:/llm/models
+    devices:
+      - /dev/dri
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_ENDPOINT: ${HF_ENDPOINT}
+      MODEL_PATH: "/llm/models"
+      SERVED_MODEL_NAME: ${LLM_MODEL}
+      TENSOR_PARALLEL_SIZE: ${TENSOR_PARALLEL_SIZE:-1}
+    shm_size: '16g'
+    entrypoint: /bin/bash -c "\
+      cd /llm && \
+      bash start-vllm-service.sh"
+networks:
+  default:
+    driver: bridge
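
After `docker compose -f compose_vllm_multi-arc.yaml up -d`, a few generic Docker checks help confirm that the stack started and that the Arc GPUs reached the serving container (the container name comes from the compose file above):

```bash
# Confirm all four services are running.
docker compose -f compose_vllm_multi-arc.yaml ps

# The render nodes under /dev/dri must be visible inside the container;
# multi-GPU inference needs at least TENSOR_PARALLEL_SIZE renderD* entries.
docker exec ipex-llm-serving-xpu-container ls /dev/dri

# Follow the vLLM startup logs for model loading and tensor-parallel init.
docker logs -f ipex-llm-serving-xpu-container
```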
@@ -156,13 +156,16 @@ def get_benchmark(name):
 
     if data.get("Benchmark enabled", False):
         benchmark_data = data.get("last_benchmark_data", {})
-        if benchmark_data.get("generator", "N/A"):
-            benchmark = (
-                f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
-                f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
-                f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
-            ).rstrip()
-            return benchmark
+        if benchmark_data and "generator" in benchmark_data:
+            if benchmark_data.get("generator", "N/A"):
+                benchmark = (
+                    f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
+                    f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
+                    f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
+                ).rstrip()
+                return benchmark
+            else:
+                return None
         else:
             return None
     else:
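
The extra guard above matters because `dict.get` with a truthy default masks a missing key: before this change, an empty `last_benchmark_data` still passed the check, and the string was built entirely from `0.0` fallbacks. A standalone illustration (not project code):

```python
benchmark_data = {}  # e.g. no benchmark has run yet

# Old check: "N/A" is a truthy default, so an empty dict slips through.
print(bool(benchmark_data.get("generator", "N/A")))  # True

# New check: requires a non-empty dict that actually contains the key.
print(bool(benchmark_data and "generator" in benchmark_data))  # False
```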
@@ -85,9 +85,9 @@ def get_system_status():
 
 def get_benchmark():
     time.sleep(0.5)
-    active_pipeline_nam = get_actived_pipeline()
-    if active_pipeline_nam:
-        data = cli.get_benchmark(active_pipeline_nam)
+    active_pipeline_name = get_actived_pipeline()
+    if active_pipeline_name:
+        data = cli.get_benchmark(active_pipeline_name)
     if data:
         return gr.update(
             visible=True,