[AudioQnA] Enable vLLM and set it as default LLM serving (#1657)

Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Wang, Kai Lawrence
2025-03-14 09:56:33 +08:00
committed by GitHub
parent 35c5cf5de8
commit 8fe19291c8
16 changed files with 747 additions and 99 deletions

View File

@@ -16,7 +16,7 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):

View File

@@ -17,7 +17,7 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):

View File

@@ -2,6 +2,10 @@
This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using the TGI backend for the LLM microservice; please refer to the [Start the MegaService](#-start-the-megaservice) section on this page.
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, make sure you have either requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
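If you go the Hugging Face route, one way to authenticate and optionally pre-fetch the gated model is the Hugging Face CLI. This is a minimal sketch, assuming `huggingface_hub` is installed and your token has been granted access to the model:

```bash
# Log in with the same token exported later as HUGGINGFACEHUB_API_TOKEN
huggingface-cli login --token ${HUGGINGFACEHUB_API_TOKEN}
# Optionally pre-download the model into the local Hugging Face cache
huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct
```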
## 🚀 Build Docker images
### 1. Source Code install GenAIComps
@@ -17,9 +21,15 @@ cd GenAIComps
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
```
### 3. Build LLM Image
### 3. Build vLLM Image
Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu (https://github.com/huggingface/text-generation-inference)
```bash
git clone https://github.com/vllm-project/vllm.git
cd ./vllm/
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
git checkout ${VLLM_VER}
docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
```
### 4. Build TTS Image
@@ -43,9 +53,10 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`; you will have the following images ready:
1. `opea/whisper:latest`
2. `opea/speecht5:latest`
3. `opea/audioqna:latest`
4. `opea/gpt-sovits:latest` (optional)
2. `opea/vllm:latest`
3. `opea/speecht5:latest`
4. `opea/audioqna:latest`
5. `opea/gpt-sovits:latest` (optional)
## 🚀 Set the environment variables
@@ -55,7 +66,7 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -73,40 +84,90 @@ export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
Or use the `set_env.sh` file to set up the environment variables.
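For example, from the Xeon compose directory (a minimal usage sketch; `set_env.sh` sits next to the compose files):

```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
source set_env.sh
```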
Note: Please replace with host_ip with your external IP address, do not use localhost.
Note:
- Please replace `host_ip` with your external IP address; do not use localhost.
- If you are in a proxy environment, also set the proxy-related environment variables:
```bash
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,gpt-sovits-service,tgi-service,vllm-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server
```
## 🚀 Start the MegaService
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
```
If using vLLM as the LLM serving backend:
```bash
docker compose up -d
# multilang tts (optional)
docker compose -f compose_multilang.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:7066/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@./sample.wav" \
-F model="openai/whisper-small"
1. Whisper Service
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```bash
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@./sample.wav" \
-F model="openai/whisper-small"
```
# speecht5 service
curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
2. LLM Backend Service
# gpt-sovits service (optional)
curl http://${host_ip}:9880/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
On the first startup, this service takes extra time to download, load, and warm up the model. Once that finishes, the service is ready, and the container (`vllm-service` or `tgi-service`) status shown via `docker ps` will be `healthy`; before that, the status will be `health: starting`.
Alternatively, try the commands below to check whether the LLM serving backend is ready.
```bash
# vLLM service
docker logs vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs tgi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
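You can also probe the `/health` endpoint that the compose healthcheck polls; an HTTP 200 response means the serving backend is up (this assumes the default `LLM_SERVER_PORT=3006` mapping from the compose file):

```bash
# Returns HTTP 200 once the vLLM (or TGI) server is ready to accept requests
curl -f http://${host_ip}:${LLM_SERVER_PORT}/health
```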
Then try the `curl` command below to validate the LLM service.
```bash
# either vLLM or TGI service
curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
-X POST \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
3. TTS Service
```bash
# speecht5 service
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
# gpt-sovits service (optional)
curl http://${host_ip}:${GPT_SOVITS_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
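To sanity-check the generated audio, you can reuse the verification the CI test applies to the MegaService output; inspecting the microservice output the same way is an assumption, but the services return WAV data (RIFF header) even though the file is named `speech.mp3`:

```bash
# A valid response decodes to RIFF (WAVE) audio data
file speech.mp3
```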
## 🚀 Test MegaService

View File

@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -17,38 +17,41 @@ services:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: vllm-service
ports:
- "3006:80"
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-xeon-backend-server
depends_on:
- whisper-service
- tgi-service
- vllm-service
- speecht5-service
ports:
- "3008:8888"

View File

@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -18,27 +18,35 @@ services:
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
container_name: gpt-sovits-service
ports:
- "9880:9880"
- ${GPT_SOVITS_SERVER_PORT:-9880}:9880
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: vllm-service
ports:
- "3006:80"
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna-multilang:${TAG:-latest}
container_name: audioqna-xeon-backend-server

View File

@@ -0,0 +1,87 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
speecht5-service:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
ports:
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-xeon-backend-server
depends_on:
- whisper-service
- tgi-service
- speecht5-service
ports:
- "3008:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
audioqna-xeon-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-xeon-ui-server
depends_on:
- audioqna-xeon-backend-server
ports:
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -8,7 +8,7 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# <token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}

View File

@@ -2,6 +2,10 @@
This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using the TGI backend for the LLM microservice; please refer to the [Start the MegaService](#-start-the-megaservice) section on this page.
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, make sure you have either requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
## 🚀 Build Docker images
### 1. Source Code install GenAIComps
@@ -17,9 +21,13 @@ cd GenAIComps
docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
```
### 3. Build LLM Image
### 3. Build vLLM Image
Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi)
```bash
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork/
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
git checkout ${VLLM_VER}
docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .
```
### 4. Build TTS Image
@@ -40,8 +48,9 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`; you will have the following images ready:
1. `opea/whisper-gaudi:latest`
2. `opea/speecht5-gaudi:latest`
3. `opea/audioqna:latest`
2. `opea/vllm-gaudi:latest`
3. `opea/speecht5-gaudi:latest`
4. `opea/audioqna:latest`
## 🚀 Set the environment variables
@@ -51,7 +60,12 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
# set vLLM parameters
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
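# These exports map to the vLLM serving flags set in compose.yaml:
#   NUM_CARDS -> --tensor-parallel-size, BLOCK_SIZE -> --block-size,
#   MAX_NUM_SEQS -> --max-num-seqs, MAX_SEQ_LEN_TO_CAPTURE -> --max-seq_len-to-capture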
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -65,37 +79,90 @@ export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
```
Or use the `set_env.sh` file to set up the environment variables.
Note:
- Please replace `host_ip` with your external IP address; do not use localhost.
- If you are in a proxy environment, also set the proxy-related environment variables:
```bash
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,tgi-service,vllm-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server
```
## 🚀 Start the MegaService
> **_NOTE:_** Users will need at least three Gaudi cards for AudioQnA.
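To confirm enough cards are free before starting the stack, you can inspect the devices with Habana's `hl-smi` tool (assuming the Gaudi driver and tools are installed on the host):

```bash
# Lists Gaudi devices with utilization and memory; AudioQnA needs at least three free cards
hl-smi
```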
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/hpu/gaudi/
```
If using vLLM as the LLM serving backend:
```bash
docker compose up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
1. Whisper Service
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```bash
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
```
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
2. LLM Backend Service
```
On the first startup, this service takes extra time to download, load, and warm up the model. Once that finishes, the service is ready, and the container (`vllm-gaudi-service` or `tgi-gaudi-service`) status shown via `docker ps` will be `healthy`; before that, the status will be `health: starting`.
Alternatively, try the commands below to check whether the LLM serving backend is ready.
```bash
# vLLM service
docker logs vllm-gaudi-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs tgi-gaudi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
Then try the `curl` command below to validate the LLM service.
```bash
# either vLLM or TGI service
curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
-X POST \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
3. TTS Service
```bash
# speecht5 service
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
```
## 🚀 Test MegaService

View File

@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -22,7 +22,7 @@ services:
image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -34,28 +34,27 @@ services:
cap_add:
- SYS_NICE
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: vllm-gaudi-service
ports:
- "3006:80"
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
@@ -63,13 +62,13 @@ services:
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
audioqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-gaudi-backend-server
depends_on:
- whisper-service
- tgi-service
- vllm-service
- speecht5-service
ports:
- "3008:8888"

View File

@@ -0,0 +1,108 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
container_name: whisper-service
ports:
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
speecht5-service:
image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
container_name: speecht5-service
ports:
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-service
ports:
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
audioqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-gaudi-backend-server
depends_on:
- whisper-service
- tgi-service
- speecht5-service
ports:
- "3008:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
audioqna-gaudi-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-gaudi-ui-server
depends_on:
- audioqna-gaudi-backend-server
ports:
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -8,7 +8,13 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# <token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
# set vLLM parameters
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}

View File

@@ -71,3 +71,15 @@ services:
dockerfile: comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile
extends: audioqna
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
vllm:
build:
context: vllm
dockerfile: Dockerfile.cpu
extends: audioqna
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
vllm-gaudi:
build:
context: vllm-fork
dockerfile: Dockerfile.hpu
extends: audioqna
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
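With these entries added, the vLLM images build through the same build file as the other components; for example, on Xeon (assuming the `vllm` source tree has been cloned into the build context as described in the README):

```bash
cd GenAIExamples/AudioQnA/docker_image_build
docker compose -f build.yaml build vllm --no-cache
```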

View File

@@ -31,18 +31,27 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork/
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi vllm-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -61,8 +70,8 @@ function start_services() {
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
docker logs vllm-gaudi-service > $LOG_PATH/vllm_service_start.log 2>&1
if grep -q complete $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 5s
@@ -86,7 +95,7 @@ function validate_megaservice() {
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-gaudi-server > $LOG_PATH/tgi-gaudi-server.log
docker logs vllm-gaudi-service > $LOG_PATH/vllm-gaudi-service.log
docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
@@ -126,7 +135,7 @@ function validate_megaservice() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose stop && docker compose rm -f
docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {

View File

@@ -31,18 +31,23 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/vllm-project/vllm.git
cd ./vllm/
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5"
service_list="audioqna audioqna-ui whisper speecht5 vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -62,8 +67,8 @@ function start_services() {
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
docker logs vllm-service > $LOG_PATH/vllm_service_start.log 2>&1
if grep -q complete $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 5s
@@ -77,7 +82,7 @@ function validate_megaservice() {
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
docker logs vllm-service > $LOG_PATH/vllm-service.log
docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
@@ -117,7 +122,7 @@ function validate_megaservice() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose stop && docker compose rm -f
docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {

View File

@@ -0,0 +1,146 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
export SPEECHT5_SERVER_HOST_IP=${ip_address}
export LLM_SERVER_HOST_IP=${ip_address}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
export host_ip=${ip_address}
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-gaudi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
n=0
until [[ "$n" -ge 100 ]]; do
docker logs whisper-service > $LOG_PATH/whisper_service_start.log
if grep -q "Uvicorn server setup on port" $LOG_PATH/whisper_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
}
function validate_megaservice() {
response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-gaudi-service > $LOG_PATH/tgi-gaudi-service.log
docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
if [[ $(file speech.mp3) == *"RIFF"* ]]; then
echo "Result correct."
else
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
# cd $WORKPATH/ui/svelte
# local conda_env_name="OPEA_e2e"
# export PATH=${HOME}/miniforge3/bin/:$PATH
## conda remove -n ${conda_env_name} --all -y
## conda create -n ${conda_env_name} python=3.12 -y
# source activate ${conda_env_name}
#
# sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
## conda install -c conda-forge nodejs=22.6.0 -y
# npm install && npm ci && npx playwright install --with-deps
# node -v && npm -v && pip list
#
# exit_status=0
# npx playwright test || exit_status=$?
#
# if [ $exit_status -ne 0 ]; then
# echo "[TEST INFO]: ---------frontend test failed---------"
# exit $exit_status
# else
# echo "[TEST INFO]: ---------frontend test passed---------"
# fi
#}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_megaservice
# validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -0,0 +1,137 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
export SPEECHT5_SERVER_HOST_IP=${ip_address}
export LLM_SERVER_HOST_IP=${ip_address}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
export host_ip=${ip_address}
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
}
function validate_megaservice() {
response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
if [[ $(file speech.mp3) == *"RIFF"* ]]; then
echo "Result correct."
else
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
# cd $WORKPATH/ui/svelte
# local conda_env_name="OPEA_e2e"
# export PATH=${HOME}/miniforge3/bin/:$PATH
## conda remove -n ${conda_env_name} --all -y
## conda create -n ${conda_env_name} python=3.12 -y
# source activate ${conda_env_name}
#
# sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
## conda install -c conda-forge nodejs=22.6.0 -y
# npm install && npm ci && npx playwright install --with-deps
# node -v && npm -v && pip list
#
# exit_status=0
# npx playwright test || exit_status=$?
#
# if [ $exit_status -ne 0 ]; then
# echo "[TEST INFO]: ---------frontend test failed---------"
# exit $exit_status
# else
# echo "[TEST INFO]: ---------frontend test passed---------"
# fi
#}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_megaservice
# validate_frontend
stop_docker
echo y | docker system prune
}
main