Code Enhancement for vllm inference (#1729)
Signed-off-by: Yongbozzz <yongbo.zhu@intel.com>
@@ -17,7 +17,7 @@ quality and performance.
 
 ### (Optional) Build Docker Images for Mega Service, Server and UI by your own
 
-If you want to build the images by your own, please follow the steps:
+**All the Docker images can be pulled automatically.** If you want to build the images on your own, follow these steps:
 
 ```bash
 cd GenAIExamples/EdgeCraftRAG
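
For reference, the prebuilt images that the compose files fall back to can also be pulled explicitly; the names below are taken from the `${REGISTRY:-opea}/...:${TAG:-latest}` defaults in the compose file added later in this commit:

```bash
# Pull the default prebuilt EdgeCraftRAG images instead of building locally.
docker pull opea/edgecraftrag-server:latest
docker pull opea/edgecraftrag:latest
docker pull opea/edgecraftrag-ui:latest
```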
@@ -101,6 +101,26 @@ export HUGGINGFACEHUB_API_TOKEN=#your HF token
 docker compose -f compose_vllm.yaml up -d
 ```
 
+#### Launch services with vLLM for multi Intel Arc GPU inference
+
+The Docker image is pulled automatically, but you can also pull it manually:
+
+```bash
+docker pull intelanalytics/ipex-llm-serving-xpu:latest
+```
+
+Set up additional environment variables and start the services with compose_vllm_multi-arc.yaml:
+
+```bash
+export LLM_MODEL=#your model id
+export VLLM_SERVICE_PORT=8008
+export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
+export LLM_MODEL_PATH=#your model path
+export TENSOR_PARALLEL_SIZE=#number of Intel Arc GPUs to use for inference
+
+docker compose -f compose_vllm_multi-arc.yaml up -d
+```
+
 ### ChatQnA with LLM Example (Command Line)
 
 ```bash
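
Once the multi-Arc stack is up, the endpoint can be sanity-checked from the host. A minimal sketch, assuming the ipex-llm serving container exposes vLLM's standard OpenAI-compatible API (the compose file below maps `${VLLM_SERVICE_PORT}` to the container's port 8000):

```bash
# List the models served by the endpoint (OpenAI-compatible API).
curl -s "${vLLM_ENDPOINT}/v1/models"

# Send a one-shot completion request to the served model.
curl -s "${vLLM_ENDPOINT}/v1/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL}"'", "prompt": "Hello", "max_tokens": 32}'
```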
New file: compose_vllm_multi-arc.yaml
@@ -0,0 +1,93 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  server:
+    image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest}
+    container_name: edgecraftrag-server
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_ENDPOINT: ${HF_ENDPOINT}
+      vLLM_ENDPOINT: ${vLLM_ENDPOINT}
+      LLM_MODEL: ${LLM_MODEL}
+      ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false}
+    volumes:
+      - ${MODEL_PATH:-${PWD}}:/home/user/models
+      - ${DOC_PATH:-${PWD}}:/home/user/docs
+      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
+      - ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
+      - ${PROMPT_PATH:-${PWD}}:/templates/custom
+    ports:
+      - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
+    devices:
+      - /dev/dri:/dev/dri
+    group_add:
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+  ecrag:
+    image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
+    container_name: edgecraftrag
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
+      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
+      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
+    ports:
+      - ${MEGA_SERVICE_PORT:-16011}:${MEGA_SERVICE_PORT:-16011}
+    depends_on:
+      - server
+  ui:
+    image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest}
+    container_name: edgecraftrag-ui
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}}
+      PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010}
+      PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
+      UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
+      UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
+    volumes:
+      - ${UI_TMPFILE_PATH:-${PWD}}:/home/user/ui_cache
+    ports:
+      - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
+    restart: always
+    depends_on:
+      - server
+      - ecrag
+  llm-serving-xpu:
+    container_name: ipex-llm-serving-xpu-container
+    image: intelanalytics/ipex-llm-serving-xpu:latest
+    privileged: true
+    ports:
+      - ${VLLM_SERVICE_PORT:-8008}:8000
+    group_add:
+      - video
+      - ${VIDEOGROUPID:-44}
+      - ${RENDERGROUPID:-109}
+    volumes:
+      - ${LLM_MODEL_PATH:-${PWD}}:/llm/models
+    devices:
+      - /dev/dri
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_ENDPOINT: ${HF_ENDPOINT}
+      MODEL_PATH: "/llm/models"
+      SERVED_MODEL_NAME: ${LLM_MODEL}
+      TENSOR_PARALLEL_SIZE: ${TENSOR_PARALLEL_SIZE:-1}
+    shm_size: '16g'
+    entrypoint: /bin/bash -c "\
+      cd /llm && \
+      bash start-vllm-service.sh"
+networks:
+  default:
+    driver: bridge
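
After `docker compose -f compose_vllm_multi-arc.yaml up -d`, a few generic Docker checks help confirm that the stack started and that the Arc GPUs reached the serving container (the container name comes from the compose file above):

```bash
# Confirm all four services are running.
docker compose -f compose_vllm_multi-arc.yaml ps

# The render nodes under /dev/dri must be visible inside the container;
# multi-GPU inference needs at least TENSOR_PARALLEL_SIZE renderD* entries.
docker exec ipex-llm-serving-xpu-container ls /dev/dri

# Follow the vLLM startup logs for model loading and tensor-parallel init.
docker logs -f ipex-llm-serving-xpu-container
```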
@@ -156,13 +156,16 @@ def get_benchmark(name):
 
     if data.get("Benchmark enabled", False):
         benchmark_data = data.get("last_benchmark_data", {})
-        if benchmark_data.get("generator", "N/A"):
-            benchmark = (
-                f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
-                f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
-                f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
-            ).rstrip()
-            return benchmark
+        if benchmark_data and "generator" in benchmark_data:
+            if benchmark_data.get("generator", "N/A"):
+                benchmark = (
+                    f"Retrieval: {benchmark_data.get('retriever', 0.0):.4f}s "
+                    f"Post-process: {benchmark_data.get('postprocessor', 0.0):.4f}s "
+                    f"Generation: {benchmark_data.get('generator', 0.0):.4f}s"
+                ).rstrip()
+                return benchmark
+            else:
+                return None
         else:
             return None
     else:
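
The extra guard above matters because `dict.get` with a truthy default masks a missing key: before this change, an empty `last_benchmark_data` still passed the check, and the string was built entirely from `0.0` fallbacks. A standalone illustration (not project code):

```python
benchmark_data = {}  # e.g. no benchmark has run yet

# Old check: "N/A" is a truthy default, so an empty dict slips through.
print(bool(benchmark_data.get("generator", "N/A")))  # True

# New check: requires a non-empty dict that actually contains the key.
print(bool(benchmark_data and "generator" in benchmark_data))  # False
```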
@@ -85,9 +85,9 @@ def get_system_status():
 
 def get_benchmark():
     time.sleep(0.5)
-    active_pipeline_nam = get_actived_pipeline()
-    if active_pipeline_nam:
-        data = cli.get_benchmark(active_pipeline_nam)
+    active_pipeline_name = get_actived_pipeline()
+    if active_pipeline_name:
+        data = cli.get_benchmark(active_pipeline_name)
     if data:
         return gr.update(
             visible=True,