Enable vllm for DocSum (#1716)

Set vLLM as the default LLM serving backend, and add the related Docker Compose files, READMEs, and test scripts.

Fix issue #1436

Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Letong Han
2025-03-28 17:15:01 +08:00
committed by GitHub
parent 87baeb833d
commit d4dcbd18ef
12 changed files with 1403 additions and 317 deletions

@@ -2,6 +2,8 @@
This document outlines the deployment process for a Document Summarization application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `llm`. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
The default pipeline deploys with vLLM as the LLM serving component. A TGI backend is also available for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.
## 🚀 Build Docker Images
### 1. Build MicroService Docker Image
@@ -108,9 +110,20 @@ To set up environment variables for deploying Document Summarization services, f
```bash
cd GenAIExamples/DocSum/docker_compose/intel/hpu/gaudi
```
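If you are not using a provided setup script, the sketch below lists the variables that the compose files in this directory reference. The variable names come from those compose files; the values are illustrative placeholders only and should be adjusted for your environment.
```bash
# Minimal sketch: export the variables referenced by compose.yaml / compose_tgi.yaml.
# Values are placeholders, not recommended settings.
export host_ip=$(hostname -I | awk '{print $1}')   # host address the services advertise
export HUGGINGFACEHUB_API_TOKEN="<your-hf-token>"  # required for gated models
export LLM_MODEL_ID="<llm-model-id>"               # model served by vLLM or TGI
export NUM_CARDS=1                                 # Gaudi cards for tensor parallelism (vLLM)
export BLOCK_SIZE=128                              # vLLM KV-cache block size
export MAX_NUM_SEQS=256                            # vLLM max concurrent sequences
export MAX_SEQ_LEN_TO_CAPTURE=2048                 # vLLM graph-capture sequence length
export MAX_INPUT_TOKENS=1024                       # input-token limit (TGI / DocSum wrapper)
export MAX_TOTAL_TOKENS=2048                       # total-token limit (TGI / DocSum wrapper)
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export MODEL_CACHE="./data"                        # host path mounted as the model cache
```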
If using vLLM as the LLM serving backend:
```bash
docker compose -f compose.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
```
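Whichever backend you choose, you can confirm that the stack started before moving on; the LLM serving container reports `healthy` once its health check passes. A quick sketch:
```bash
# Check container status; the LLM backend turns "healthy" once /health responds.
docker compose -f compose.yaml ps        # use compose_tgi.yaml for the TGI variant

# To switch backends later, bring the current stack down before starting the other one.
docker compose -f compose.yaml down
docker compose -f compose_tgi.yaml up -d
```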
You will have the following Docker Images:
1. `opea/docsum-ui:latest`
@@ -120,10 +133,30 @@ You will have the following Docker Images:
### Validate Microservices
1. TGI Service
1. LLM backend Service
During the first startup, this service takes extra time to download, load, and warm up the model. Once that finishes, the service is ready.
Try the commands below to check whether the LLM serving backend is ready.
```bash
curl http://${host_ip}:8008/generate \
# vLLM service
docker logs docsum-gaudi-vllm-service 2>&1 | grep complete
# If the service is ready, you will see a log line like the one below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs docsum-gaudi-tgi-server | grep Connected
# If the service is ready, you will see a log line like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
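Both compose files also define a health check against the serving container's `/health` endpoint, so you can poll it directly from the host; this sketch assumes the default published port of 8008:
```bash
# Returns success once the vLLM or TGI server is up and serving.
curl -sf http://${host_ip}:8008/health && echo "LLM serving backend is ready"
```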
Then try the `cURL` command below to validate the LLM serving backend.
```bash
# either vLLM or TGI service
curl http://${host_ip}:8008/v1/chat/completions \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
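
# Once the LLM backend responds, you can also exercise the DocSum megaservice end to end.
# Sketch only: the /v1/docsum route and the payload fields are assumed from other OPEA
# releases and may differ here; 8888 is the backend port default in the compose files.
curl -X POST http://${host_ip}:8888/v1/docsum \
  -H 'Content-Type: application/json' \
  -d '{"type": "text", "messages": "Text summarization condenses a long document into a short summary."}'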

@@ -2,47 +2,42 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-gaudi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: docsum-gaudi-vllm-service
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
- "8008:80"
volumes:
- "${DATA_PATH:-./data}:/data"
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
llm-docsum-tgi:
llm-docsum-vllm:
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
container_name: llm-docsum-gaudi-server
container_name: docsum-gaudi-llm-server
depends_on:
tgi-gaudi-server:
vllm-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
- ${LLM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -59,7 +54,7 @@ services:
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-server
container_name: docsum-gaudi-whisper-server
ports:
- "7066:7066"
ipc: host
@@ -78,10 +73,10 @@ services:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-gaudi-server
- llm-docsum-tgi
- vllm-service
- llm-docsum-vllm
ports:
- "8888:8888"
- "${BACKEND_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -99,7 +94,7 @@ services:
depends_on:
- docsum-gaudi-backend-server
ports:
- "5173:5173"
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}

@@ -0,0 +1,114 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tgi-gaudi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: docsum-gaudi-tgi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
container_name: docsum-gaudi-llm-server
depends_on:
tgi-gaudi-server:
condition: service_healthy
ports:
- ${LLM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: docsum-gaudi-whisper-server
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
docsum-gaudi-backend-server:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-gaudi-server
- llm-docsum-tgi
ports:
- "${BACKEND_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
- ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
ipc: host
restart: always
docsum-gradio-ui:
image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
container_name: docsum-gaudi-ui-server
depends_on:
- docsum-gaudi-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge
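Where the compose files parameterize published ports (`LLM_ENDPOINT_PORT`, `LLM_PORT`, `BACKEND_SERVICE_PORT`, `FRONTEND_SERVICE_PORT`), colliding ports can be remapped without editing the YAML. A sketch, assuming the TGI variant and arbitrary free ports:
```bash
# Remap host ports if the defaults (8008/9000/8888/5173) are already in use.
export LLM_ENDPOINT_PORT=9008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"   # keep the wrapper pointed at the remapped port
export LLM_PORT=9100
export BACKEND_SERVICE_PORT=9888
export FRONTEND_SERVICE_PORT=6173
docker compose -f compose_tgi.yaml up -d
```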