Enable vllm for DocSum (#1716)
Set vLLM as the default LLM serving backend, and add the related Docker Compose files, READMEs, and test scripts. Fixes issue #1436 Signed-off-by: letonghan <letong.han@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
@@ -2,6 +2,8 @@

This document outlines the deployment process for a Document Summarization application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `llm`. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.

The default pipeline deploys with vLLM as the LLM serving component. There is also an option to use the TGI backend for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.

## 🚀 Build Docker Images
### 1. Build MicroService Docker Image
@@ -108,9 +110,20 @@ To set up environment variables for deploying Document Summarization services, f

```bash
cd GenAIExamples/DocSum/docker_compose/intel/hpu/gaudi
```

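Both compose files read their configuration from environment variables. The sketch below is illustrative only: the variable names are taken from the compose files in this PR, while the values (model ID, token counts, `DocSum_COMPONENT_NAME`, and the `/v1/docsum` route) are placeholders and assumptions to adapt; if the repository ships a `set_env.sh` alongside these files, sourcing it is the simpler path.

```bash
# Illustrative values only – adapt to your environment.
export host_ip=$(hostname -I | awk '{print $1}')              # an IP the containers can reach
export HUGGINGFACEHUB_API_TOKEN="<your-huggingface-token>"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"     # example model
export NUM_CARDS=1
export MODEL_CACHE="./data"
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export MAX_INPUT_TOKENS=1024
export MAX_TOTAL_TOKENS=2048
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export DocSum_COMPONENT_NAME="OpeaDocSumvLLM"                 # assumption; use the TGI variant with compose_tgi.yaml
export LLM_PORT=9000
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_PORT=8888
export FRONTEND_SERVICE_PORT=5173
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${BACKEND_SERVICE_PORT}/v1/docsum"   # assumption: DocSum gateway route
```
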
If using vLLM as the LLM serving backend:

```bash
docker compose -f compose.yaml up -d
```

If using TGI as the LLM serving backend:

```bash
docker compose -f compose_tgi.yaml up -d
```

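Whichever backend you start, you can quickly confirm the containers came up; the container names all follow the compose files in this example and contain `docsum`:

```bash
# List running DocSum containers with their status.
docker ps --format '{{.Names}}\t{{.Status}}' | grep docsum
```
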
You will have the following Docker Images:
1. `opea/docsum-ui:latest`
@@ -120,10 +133,30 @@ You will have the following Docker Images:
### Validate Microservices
1. TGI Service
1. LLM backend Service
In the first startup, this service will take more time to download, load, and warm up the model. Once it finishes, the service will be ready.
Try the command below to check whether the LLM serving backend is ready.

```bash
curl http://${host_ip}:8008/generate \
# vLLM service
docker logs docsum-gaudi-vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```

```bash
# TGI service
docker logs docsum-gaudi-tgi-server | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```

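If you prefer polling over reading logs, both backends expose the `/health` endpoint that the compose healthchecks use, so a small wait loop works for either one (a minimal sketch, assuming the default host port mapping of 8008):

```bash
# Wait until the LLM serving backend (vLLM or TGI) reports healthy.
until curl -sf "http://${host_ip}:8008/health" > /dev/null; do
  echo "Waiting for the LLM serving backend..."
  sleep 10
done
echo "LLM serving backend is ready."
```
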
Then try the `cURL` command below to validate services.

```bash
# either vLLM or TGI service
curl http://${host_ip}:8008/v1/chat/completions \
  -X POST \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' \
  -H 'Content-Type: application/json'
```

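After the backend megaservice is up, the whole pipeline can be exercised through the gateway. This is a hedged sketch only: it assumes `BACKEND_SERVICE_ENDPOINT` points at the DocSum gateway route (for example `http://${host_ip}:8888/v1/docsum`) and that the gateway accepts a plain-text summarization request as form data; check the main DocSum README for the authoritative request schema.

```bash
# Hypothetical end-to-end check against the DocSum gateway.
curl -X POST "${BACKEND_SERVICE_ENDPOINT}" \
  -H "Content-Type: multipart/form-data" \
  -F "type=text" \
  -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models."
```
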
@@ -2,47 +2,42 @@
# SPDX-License-Identifier: Apache-2.0

services:
  tgi-gaudi-server:
    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
    container_name: tgi-gaudi-server
  vllm-service:
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
    container_name: docsum-gaudi-vllm-service
    ports:
      - ${LLM_ENDPOINT_PORT:-8008}:80
      - "8008:80"
    volumes:
      - "${DATA_PATH:-./data}:/data"
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      ENABLE_HPU_GRAPH: true
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      NUM_CARDS: ${NUM_CARDS}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
    command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}

  llm-docsum-tgi:
  llm-docsum-vllm:
    image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
    container_name: llm-docsum-gaudi-server
    container_name: docsum-gaudi-llm-server
    depends_on:
      tgi-gaudi-server:
      vllm-service:
        condition: service_healthy
    ports:
      - ${DOCSUM_PORT:-9000}:9000
      - ${LLM_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
@@ -59,7 +54,7 @@ services:

  whisper:
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
    container_name: whisper-server
    container_name: docsum-gaudi-whisper-server
    ports:
      - "7066:7066"
    ipc: host
@@ -78,10 +73,10 @@ services:
    image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
    container_name: docsum-gaudi-backend-server
    depends_on:
      - tgi-gaudi-server
      - llm-docsum-tgi
      - vllm-service
      - llm-docsum-vllm
    ports:
      - "8888:8888"
      - "${BACKEND_SERVICE_PORT:-8888}:8888"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
@@ -99,7 +94,7 @@ services:
    depends_on:
      - docsum-gaudi-backend-server
    ports:
      - "5173:5173"
      - "${FRONTEND_SERVICE_PORT:-5173}:5173"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}

DocSum/docker_compose/intel/hpu/gaudi/compose_tgi.yaml (new file)
@@ -0,0 +1,114 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  tgi-gaudi-server:
    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
    container_name: docsum-gaudi-tgi-server
    ports:
      - ${LLM_ENDPOINT_PORT:-8008}:80
    volumes:
      - "${MODEL_CACHE}:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      ENABLE_HPU_GRAPH: true
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}

  llm-docsum-tgi:
    image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
    container_name: docsum-gaudi-llm-server
    depends_on:
      tgi-gaudi-server:
        condition: service_healthy
    ports:
      - ${LLM_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
      LLM_ENDPOINT: ${LLM_ENDPOINT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped

  whisper:
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
    container_name: docsum-gaudi-whisper-server
    ports:
      - "7066:7066"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
    runtime: habana
    cap_add:
      - SYS_NICE
    restart: unless-stopped

  docsum-gaudi-backend-server:
    image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
    container_name: docsum-gaudi-backend-server
    depends_on:
      - tgi-gaudi-server
      - llm-docsum-tgi
    ports:
      - "${BACKEND_SERVICE_PORT:-8888}:8888"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}

    ipc: host
    restart: always

  docsum-gradio-ui:
    image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
    container_name: docsum-gaudi-ui-server
    depends_on:
      - docsum-gaudi-backend-server
    ports:
      - "${FRONTEND_SERVICE_PORT:-5173}:5173"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
      - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
    ipc: host
    restart: always

networks:
  default:
    driver: bridge