Code Enhancement for vllm inference (#1729)

Signed-off-by: Yongbozzz <yongbo.zhu@intel.com>
Author: Zhu Yongbo
Date: 2025-04-03 13:37:49 +08:00
Committed by: GitHub
Parent: bbd53443ab
Commit: 1a0c5f03c6
4 changed files with 127 additions and 11 deletions


@@ -17,7 +17,7 @@ quality and performance.
### (Optional) Build Docker Images for Mega Service, Server and UI by your own
-If you want to build the images by your own, please follow the steps:
+**All the Docker images can be pulled automatically.** If you want to build the images yourself, please follow these steps:
```bash
cd GenAIExamples/EdgeCraftRAG
@@ -101,6 +101,26 @@ export HUGGINGFACEHUB_API_TOKEN=#your HF token
docker compose -f compose_vllm.yaml up -d
```
#### Launch services with vLLM for multi Intel Arc GPU inference
The Docker image can be pulled automatically, or you can pull it manually:
```bash
docker pull intelanalytics/ipex-llm-serving-xpu:latest
```
Set up the additional environment variables and start the services with `compose_vllm_multi-arc.yaml`:
```bash
export LLM_MODEL=#your model id
export VLLM_SERVICE_PORT=8008
export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
export LLM_MODEL_PATH=#your model path
export TENSOR_PARALLEL_SIZE=#your Intel Arc GPU number to do inference
docker compose -f compose_vllm_multi-arc.yaml up -d
```
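Once the containers are up, you can sanity-check the serving backend before wiring it into the RAG pipeline. The sketch below assumes the container exposes vLLM's OpenAI-compatible API at `${vLLM_ENDPOINT}` and serves the model under the name given in `LLM_MODEL` (as configured in `compose_vllm_multi-arc.yaml`):
```bash
# List the models registered with the vLLM server
curl ${vLLM_ENDPOINT}/v1/models

# Request a short test completion from the served model
curl ${vLLM_ENDPOINT}/v1/completions \
  -H "Content-Type: application/json" \
  -d "{\"model\": \"${LLM_MODEL}\", \"prompt\": \"What is Edge Computing?\", \"max_tokens\": 32}"
```
If both calls return JSON responses, the multi-Arc backend is ready for the examples below.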
### ChatQnA with LLM Example (Command Line)
```bash


@@ -0,0 +1,93 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  server:
    image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest}
    container_name: edgecraftrag-server
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_ENDPOINT: ${HF_ENDPOINT}
      vLLM_ENDPOINT: ${vLLM_ENDPOINT}
      LLM_MODEL: ${LLM_MODEL}
      ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false}
    volumes:
      - ${MODEL_PATH:-${PWD}}:/home/user/models
      - ${DOC_PATH:-${PWD}}:/home/user/docs
      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
      - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
      - ${PROMPT_PATH:-${PWD}}:/templates/custom
    ports:
      - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
    devices:
      - /dev/dri:/dev/dri
    group_add:
      - ${VIDEOGROUPID:-44}
      - ${RENDERGROUPID:-109}
  ecrag:
    image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
    container_name: edgecraftrag
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
    ports:
      - ${MEGA_SERVICE_PORT:-16011}:${MEGA_SERVICE_PORT:-16011}
    depends_on:
      - server
  ui:
    image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest}
    container_name: edgecraftrag-ui
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
      UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
      UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
    volumes:
      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
    ports:
      - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
    restart: always
    depends_on:
      - server
      - ecrag
  llm-serving-xpu:
    container_name: ipex-llm-serving-xpu-container
    image: intelanalytics/ipex-llm-serving-xpu:latest
    privileged: true
    ports:
      - ${VLLM_SERVICE_PORT:-8008}:8000
    group_add:
      - video
      - ${VIDEOGROUPID:-44}
      - ${RENDERGROUPID:-109}
    volumes:
      - ${LLM_MODEL_PATH:-${PWD}}:/llm/models
    devices:
      - /dev/dri
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_ENDPOINT: ${HF_ENDPOINT}
      MODEL_PATH: "/llm/models"
      SERVED_MODEL_NAME: ${LLM_MODEL}
      TENSOR_PARALLEL_SIZE: ${TENSOR_PARALLEL_SIZE:-1}
    shm_size: '16g'
    entrypoint: /bin/bash -c "\
      cd /llm && \
      bash start-vllm-service.sh"

networks:
  default:
    driver: bridge
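For reference, once this file is up you can verify that the serving container started and that the Arc GPUs are visible to it. This is a minimal sketch using the container name defined above; adjust it if you rename the service:
```bash
# Follow the vLLM serving container logs until the API server reports it is listening
docker logs -f ipex-llm-serving-xpu-container

# Confirm the Intel Arc render devices mapped via /dev/dri are visible inside the container
docker exec ipex-llm-serving-xpu-container ls -l /dev/dri
```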


@@ -156,13 +156,16 @@ def get_benchmark(name):
     if data.get("Benchmark enabled", False):
         benchmark_data = data.get("last_benchmark_data", {})
-        if benchmark_data.get("generator", "N/A"):
-            benchmark = (
-                f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
-                f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
-                f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
-            ).rstrip()
-            return benchmark
+        if benchmark_data and "generator" in benchmark_data:
+            if benchmark_data.get("generator", "N/A"):
+                benchmark = (
+                    f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
+                    f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
+                    f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
+                ).rstrip()
+                return benchmark
+            else:
+                return None
         else:
             return None
     else:


@@ -85,9 +85,9 @@ def get_system_status():
 def get_benchmark():
     time.sleep(0.5)
-    active_pipeline_nam = get_actived_pipeline()
-    if active_pipeline_nam:
-        data = cli.get_benchmark(active_pipeline_nam)
+    active_pipeline_name = get_actived_pipeline()
+    if active_pipeline_name:
+        data = cli.get_benchmark(active_pipeline_name)
         if data:
             return gr.update(
                 visible=True,