Code Enhancement for vLLM inference (#1729)
Signed-off-by: Yongbozzz <yongbo.zhu@intel.com>
@@ -17,7 +17,7 @@ quality and performance.

 ### (Optional) Build Docker Images for Mega Service, Server and UI on your own

-If you want to build the images by your own, please follow the steps:
+**All the docker images can be pulled automatically.** If you want to build the images on your own, follow these steps:

 ```bash
 cd GenAIExamples/EdgeCraftRAG
@@ -101,6 +101,26 @@ export HUGGINGFACEHUB_API_TOKEN=#your HF token
 docker compose -f compose_vllm.yaml up -d
 ```

+#### Launch services with vLLM for inference on multiple Intel Arc GPUs
+
+The Docker image can be pulled automatically; you can also pull it manually:
+
+```bash
+docker pull intelanalytics/ipex-llm-serving-xpu:latest
+```
+
+Set up additional environment variables and start the services with compose_vllm_multi-arc.yaml:
+
+```bash
+export LLM_MODEL=#your model id
+export VLLM_SERVICE_PORT=8008
+export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
+export LLM_MODEL_PATH=#your model path
+export TENSOR_PARALLEL_SIZE=#number of Intel Arc GPUs to use for inference
+
+docker compose -f compose_vllm_multi-arc.yaml up -d
+```
+
 ### ChatQnA with LLM Example (Command Line)

 ```bash
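Once these services are up, the vLLM endpoint can be sanity-checked before any pipeline uses it. A minimal sketch, assuming the ipex-llm container serves the standard vLLM OpenAI-compatible API on the container port that compose_vllm_multi-arc.yaml maps to VLLM_SERVICE_PORT:

```bash
# List the models the vLLM server reports
curl -s "http://${HOST_IP}:${VLLM_SERVICE_PORT}/v1/models"

# Send a minimal completion request to the served model
curl -s "http://${HOST_IP}:${VLLM_SERVICE_PORT}/v1/completions" \
  -H "Content-Type: application/json" \
  -d "{\"model\": \"${LLM_MODEL}\", \"prompt\": \"Hello\", \"max_tokens\": 16}"
```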
compose_vllm_multi-arc.yaml (new file):

@@ -0,0 +1,93 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  server:
+    image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest}
+    container_name: edgecraftrag-server
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_ENDPOINT: ${HF_ENDPOINT}
+      vLLM_ENDPOINT: ${vLLM_ENDPOINT}
+      LLM_MODEL: ${LLM_MODEL}
+      ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false}
+    volumes:
+      - ${MODEL_PATH:-${PWD}}:/home/user/models
+      - ${DOC_PATH:-${PWD}}:/home/user/docs
+      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
+      - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
+      - ${PROMPT_PATH:-${PWD}}:/templates/custom
+    ports:
+      - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
+    devices:
+      - /dev/dri:/dev/dri
+    group_add:
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+  ecrag:
+    image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
+    container_name: edgecraftrag
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
+      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
+      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
+    ports:
+      - ${MEGA_SERVICE_PORT:-16011}:${MEGA_SERVICE_PORT:-16011}
+    depends_on:
+      - server
+  ui:
+    image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest}
+    container_name: edgecraftrag-ui
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
+      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
+      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
+      UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
+      UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
+    volumes:
+      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
+    ports:
+      - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
+    restart: always
+    depends_on:
+      - server
+      - ecrag
+  llm-serving-xpu:
+    container_name: ipex-llm-serving-xpu-container
+    image: intelanalytics/ipex-llm-serving-xpu:latest
+    privileged: true
+    ports:
+      - ${VLLM_SERVICE_PORT:-8008}:8000
+    group_add:
+      - video
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+    volumes:
+      - ${LLM_MODEL_PATH:-${PWD}}:/llm/models
+    devices:
+      - /dev/dri
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_ENDPOINT: ${HF_ENDPOINT}
+      MODEL_PATH: "/llm/models"
+      SERVED_MODEL_NAME: ${LLM_MODEL}
+      TENSOR_PARALLEL_SIZE: ${TENSOR_PARALLEL_SIZE:-1}
+    shm_size: '16g'
+    entrypoint: /bin/bash -c "\
+      cd /llm && \
+      bash start-vllm-service.sh"
+networks:
+  default:
+    driver: bridge
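In this file, the llm-serving-xpu service passes the host's /dev/dri devices into the container and shards the model across TENSOR_PARALLEL_SIZE Arc GPUs. A quick sketch for checking how many render nodes the host actually exposes before choosing TENSOR_PARALLEL_SIZE (assumes each Arc GPU shows up as one /dev/dri/renderD* node; vendor ID 0x8086 is Intel):

```bash
# Count DRM render nodes visible to the host
ls /dev/dri/renderD* 2>/dev/null | wc -l

# Confirm the render nodes belong to Intel GPUs (vendor ID 0x8086)
for v in /sys/class/drm/renderD*/device/vendor; do
  echo "$v: $(cat "$v")"
done
```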
@@ -156,13 +156,16 @@ def get_benchmark(name):

     if data.get("Benchmark enabled", False):
         benchmark_data = data.get("last_benchmark_data", {})
-        if benchmark_data.get("generator", "N/A"):
-            benchmark = (
-                f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
-                f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
-                f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
-            ).rstrip()
-            return benchmark
+        if benchmark_data and "generator" in benchmark_data:
+            if benchmark_data.get("generator", "N/A"):
+                benchmark = (
+                    f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
+                    f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
+                    f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
+                ).rstrip()
+                return benchmark
+            else:
+                return None
+        else:
+            return None
     else:
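Read together, the new branches amount to one guard: format the timing summary only when benchmark data exists and carries a truthy generator timing, otherwise return None. An equivalent flattened sketch (illustrative only; this helper is not part of the commit):

```python
def format_benchmark(data: dict) -> str | None:
    """Illustrative flattening of the hunk's nested branches (not in the commit)."""
    if not data.get("Benchmark enabled", False):
        # The hunk's trailing else-branch is truncated in the diff;
        # returning None here is an assumption for this sketch.
        return None
    benchmark_data = data.get("last_benchmark_data", {})
    # One check covers the empty-dict, missing-key, and falsy-timing cases.
    if not benchmark_data.get("generator"):
        return None
    return (
        f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
        f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
        f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
    ).rstrip()
```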
@@ -85,9 +85,9 @@ def get_system_status():

 def get_benchmark():
     time.sleep(0.5)
-    active_pipeline_nam = get_actived_pipeline()
-    if active_pipeline_nam:
-        data = cli.get_benchmark(active_pipeline_nam)
+    active_pipeline_name = get_actived_pipeline()
+    if active_pipeline_name:
+        data = cli.get_benchmark(active_pipeline_name)
         if data:
             return gr.update(
                 visible=True,
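This hunk only renames active_pipeline_nam to active_pipeline_name for readability; behavior is unchanged. For context, a minimal self-contained sketch of the pattern the function follows, returning gr.update to toggle a benchmark readout in a Gradio UI (illustrative; the components and sample string are hypothetical, not the project's actual UI):

```python
import gradio as gr

def get_benchmark_text():
    # Hypothetical stand-in for cli.get_benchmark(active_pipeline_name):
    # a timing summary string when a pipeline is active, otherwise None.
    data = "Retrieval: 0.0123s Post-process: 0.0042s Generation: 1.2345s"
    if data:
        return gr.update(visible=True, value=data)
    return gr.update(visible=False)

with gr.Blocks() as demo:
    benchmark_box = gr.Textbox(label="Benchmark", visible=False)
    refresh = gr.Button("Refresh benchmark")
    refresh.click(fn=get_benchmark_text, outputs=benchmark_box)

demo.launch()
```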