[AudioQnA] Enable vLLM and set it as default LLM serving (#1657)
Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

Committed by: GitHub
Parent: 35c5cf5de8
Commit: 8fe19291c8
@@ -16,7 +16,7 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
 SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
 LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
-LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")


 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -17,7 +17,7 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
 GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
 LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
-LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
+LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")


 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
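Both entry files above read their serving endpoints and the model ID from the environment, so the new defaults can be overridden without touching code. A minimal sketch (the address below is purely illustrative, not a project default):

```bash
# Point both pipelines at an LLM serving instance on another host; the entry
# files pick these values up via os.getenv at startup.
export LLM_SERVER_HOST_IP=10.0.0.42   # illustrative address, replace with yours
export LLM_SERVER_PORT=3006
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
```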
@@ -2,6 +2,10 @@
 
 This document outlines the deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server.
 
+The default pipeline deploys with vLLM as the LLM serving component. It also provides options of using TGI backend for LLM microservice, please refer to [Start the MegaService](#-start-the-megaservice) section in this page.
+
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+
 ## 🚀 Build Docker images
 
 ### 1. Source Code install GenAIComps
@@ -17,9 +21,15 @@ cd GenAIComps
 docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
 ```
 
-### 3. Build LLM Image
+### 3. Build vLLM Image
 
-Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu (https://github.com/huggingface/text-generation-inference)
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd ./vllm/
+VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
+```
 
 ### 4. Build TTS Image
 
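As a quick sanity check after the build step above, a small sketch (assuming the clone and build ran in the current shell) to record which vLLM tag was used and confirm the CPU image landed locally:

```bash
# VLLM_VER is set by the build snippet above; echo it for the record.
echo "Built opea/vllm:latest from vLLM tag ${VLLM_VER}"

# Verify the image exists in the local image cache.
docker images opea/vllm:latest
```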
@@ -43,9 +53,10 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
 Then run the command `docker images`, you will have following images ready:
 
 1. `opea/whisper:latest`
-2. `opea/speecht5:latest`
-3. `opea/audioqna:latest`
-4. `opea/gpt-sovits:latest` (optional)
+2. `opea/vllm:latest`
+3. `opea/speecht5:latest`
+4. `opea/audioqna:latest`
+5. `opea/gpt-sovits:latest` (optional)
 
 ## 🚀 Set the environment variables
 
@@ -55,7 +66,7 @@ Before starting the services with `docker compose`, you have to recheck the foll
 export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=<your HF token>
 
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
 
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -73,40 +84,90 @@ export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 
 or use set_env.sh file to setup environment variables.
 
-Note: Please replace with host_ip with your external IP address, do not use localhost.
+Note:
 
+- Please replace with host_ip with your external IP address, do not use localhost.
+- If you are in a proxy environment, also set the proxy-related environment variables:
+
+```
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,gpt-sovits-service,tgi-service,vllm-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server
+```
+
 ## 🚀 Start the MegaService
 
 ```bash
 cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
+```
+
+If use vLLM as the LLM serving backend:
+
+```
 docker compose up -d
 
 # multilang tts (optional)
 docker compose -f compose_multilang.yaml up -d
 ```
 
+If use TGI as the LLM serving backend:
+
+```
+docker compose -f compose_tgi.yaml up -d
+```
+
 ## 🚀 Test MicroServices
 
-```bash
-# whisper service
-wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
-curl http://${host_ip}:7066/v1/audio/transcriptions \
-  -H "Content-Type: multipart/form-data" \
-  -F file="@./sample.wav" \
-  -F model="openai/whisper-small"
-
-# tgi service
-curl http://${host_ip}:3006/generate \
-  -X POST \
-  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-  -H 'Content-Type: application/json'
-
-# speecht5 service
-curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
-
-# gpt-sovits service (optional)
-curl http://${host_ip}:9880/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
-```
+1. Whisper Service
+
+```bash
+wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
+curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
+  -H "Content-Type: multipart/form-data" \
+  -F file="@./sample.wav" \
+  -F model="openai/whisper-small"
+```
+
+2. LLM backend Service
+
+In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready and the container (`vllm-service` or `tgi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
+
+Or try the command below to check whether the LLM serving is ready.
+
+```bash
+# vLLM service
+docker logs vllm-service 2>&1 | grep complete
+# If the service is ready, you will get the response like below.
+INFO: Application startup complete.
+```
+
+```bash
+# TGI service
+docker logs tgi-service | grep Connected
+# If the service is ready, you will get the response like below.
+2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
+```
+
+Then try the `cURL` command below to validate services.
+
+```bash
+# either vLLM or TGI service
+curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
+  -X POST \
+  -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+```
+
+3. TTS Service
+
+```
+# speecht5 service
+curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
+
+# gpt-sovits service (optional)
+curl http://${host_ip}:${GPT_SOVITS_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
+```
 
 ## 🚀 Test MegaService
 
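The readiness checks above grep the container logs; an equivalent sketch that polls the `/health` endpoint the compose healthcheck already uses (works for either backend, assuming `host_ip` and `LLM_SERVER_PORT` are exported as shown earlier):

```bash
# Wait until the LLM serving endpoint answers on /health before sending requests.
until curl -sf "http://${host_ip}:${LLM_SERVER_PORT}/health" > /dev/null; do
  echo "LLM serving not ready yet, retrying in 5s..."
  sleep 5
done
echo "LLM serving is ready."
```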
@@ -6,7 +6,7 @@ services:
     image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
     container_name: whisper-service
     ports:
-      - "7066:7066"
+      - ${WHISPER_SERVER_PORT:-7066}:7066
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -17,38 +17,41 @@ services:
     image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
     container_name: speecht5-service
     ports:
-      - "7055:7055"
+      - ${SPEECHT5_SERVER_PORT:-7055}:7055
     ipc: host
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
     restart: unless-stopped
-  tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
-    container_name: tgi-service
+  vllm-service:
+    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+    container_name: vllm-service
     ports:
-      - "3006:80"
+      - ${LLM_SERVER_PORT:-3006}:80
     volumes:
-      - "${MODEL_CACHE:-./data}:/data"
-    shm_size: 1g
+      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+    shm_size: 128g
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
     healthcheck:
-      test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
       interval: 10s
       timeout: 10s
       retries: 100
-    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+    command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
   audioqna-xeon-backend-server:
     image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
     container_name: audioqna-xeon-backend-server
     depends_on:
       - whisper-service
-      - tgi-service
+      - vllm-service
       - speecht5-service
     ports:
       - "3008:8888"
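With the healthcheck block added above, readiness can also be read straight from Docker rather than the logs; a small sketch using the default container name from this compose file:

```bash
# Prints "starting" while the model is downloading/loading and "healthy"
# once the /health probe in the compose healthcheck succeeds.
docker inspect --format '{{.State.Health.Status}}' vllm-service
```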
@@ -6,7 +6,7 @@ services:
     image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
     container_name: whisper-service
     ports:
-      - "7066:7066"
+      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -18,27 +18,35 @@ services:
     image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
     container_name: gpt-sovits-service
     ports:
-      - "9880:9880"
+      - ${GPT_SOVITS_SERVER_PORT:-9880}:9880
     ipc: host
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
     restart: unless-stopped
-  tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
-    container_name: tgi-service
+  vllm-service:
+    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+    container_name: vllm-service
     ports:
-      - "3006:80"
+      - ${LLM_SERVER_PORT:-3006}:80
     volumes:
-      - "${MODEL_CACHE:-./data}:/data"
-    shm_size: 1g
+      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+    shm_size: 128g
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
+      interval: 10s
+      timeout: 10s
+      retries: 100
+    command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
   audioqna-xeon-backend-server:
     image: ${REGISTRY:-opea}/audioqna-multilang:${TAG:-latest}
     container_name: audioqna-xeon-backend-server
AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml (new file)
@@ -0,0 +1,87 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  whisper-service:
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
    container_name: whisper-service
    ports:
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
  speecht5-service:
    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
    container_name: speecht5-service
    ports:
      - ${SPEECHT5_SERVER_PORT:-7055}:7055
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    container_name: tgi-service
    ports:
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
  audioqna-xeon-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-xeon-backend-server
    depends_on:
      - whisper-service
      - tgi-service
      - speecht5-service
    ports:
      - "3008:8888"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
      - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
      - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
      - LLM_SERVER_PORT=${LLM_SERVER_PORT}
      - LLM_MODEL_ID=${LLM_MODEL_ID}
      - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
      - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
    ipc: host
    restart: always
  audioqna-xeon-ui-server:
    image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
    container_name: audioqna-xeon-ui-server
    depends_on:
      - audioqna-xeon-backend-server
    ports:
      - "5173:5173"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
    ipc: host
    restart: always

networks:
  default:
    driver: bridge
@@ -8,7 +8,7 @@ export host_ip=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 # <token>
 
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
 
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -2,6 +2,10 @@
 
 This document outlines the deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server.
 
+The default pipeline deploys with vLLM as the LLM serving component. It also provides options of using TGI backend for LLM microservice, please refer to [Start the MegaService](#-start-the-megaservice) section in this page.
+
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+
 ## 🚀 Build Docker images
 
 ### 1. Source Code install GenAIComps
@@ -17,9 +21,13 @@ cd GenAIComps
 docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
 ```
 
-### 3. Build LLM Image
+### 3. Build vLLM Image
 
-Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi)
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork/
+VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .
 
 ### 4. Build TTS Image
 
@@ -40,8 +48,9 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
 Then run the command `docker images`, you will have following images ready:
 
 1. `opea/whisper-gaudi:latest`
-2. `opea/speecht5-gaudi:latest`
-3. `opea/audioqna:latest`
+2. `opea/vllm-gaudi:latest`
+3. `opea/speecht5-gaudi:latest`
+4. `opea/audioqna:latest`
 
 ## 🚀 Set the environment variables
 
@@ -51,7 +60,12 @@ Before starting the services with `docker compose`, you have to recheck the foll
 export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=<your HF token>
 
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+# set vLLM parameters
+export NUM_CARDS=1
+export BLOCK_SIZE=128
+export MAX_NUM_SEQS=256
+export MAX_SEQ_LEN_TO_CAPTURE=2048
 
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export WHISPER_SERVER_HOST_IP=${host_ip}
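The four new exports are forwarded to the vLLM Gaudi container by the compose file (`NUM_CARDS` becomes `--tensor-parallel-size`, `BLOCK_SIZE` becomes `--block-size`, and so on). A sketch of overriding them for a multi-card run; the values are illustrative, not tuned recommendations:

```bash
# Spread the model across two Gaudi cards and allow longer captured sequences,
# then (re)create the serving service with the new settings.
export NUM_CARDS=2
export MAX_SEQ_LEN_TO_CAPTURE=4096
docker compose up -d vllm-service
```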
@@ -65,37 +79,90 @@ export LLM_SERVER_PORT=3006
 export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 ```
 
+or use set_env.sh file to setup environment variables.
+
+Note:
+
+- Please replace with host_ip with your external IP address, do not use localhost.
+- If you are in a proxy environment, also set the proxy-related environment variables:
+
+```
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,tgi-service,vllm-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server
+```
+
 ## 🚀 Start the MegaService
 
 > **_NOTE:_** Users will need at least three Gaudi cards for AudioQnA.
 
 ```bash
 cd GenAIExamples/AudioQnA/docker_compose/intel/hpu/gaudi/
+```
+
+If use vLLM as the LLM serving backend:
+
+```
 docker compose up -d
 ```
 
+If use TGI as the LLM serving backend:
+
+```
+docker compose -f compose_tgi.yaml up -d
+```
+
 ## 🚀 Test MicroServices
 
-```bash
-# whisper service
-curl http://${host_ip}:7066/v1/asr \
-  -X POST \
-  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-  -H 'Content-Type: application/json'
-
-# tgi service
-curl http://${host_ip}:3006/generate \
-  -X POST \
-  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-  -H 'Content-Type: application/json'
-
-# speecht5 service
-curl http://${host_ip}:7055/v1/tts \
-  -X POST \
-  -d '{"text": "Who are you?"}' \
-  -H 'Content-Type: application/json'
-
-```
+1. Whisper Service
+
+```bash
+curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr \
+  -X POST \
+  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+  -H 'Content-Type: application/json'
+```
+
+2. LLM backend Service
+
+In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready and the container (`vllm-gaudi-service` or `tgi-gaudi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
+
+Or try the command below to check whether the LLM serving is ready.
+
+```bash
+# vLLM service
+docker logs vllm-gaudi-service 2>&1 | grep complete
+# If the service is ready, you will get the response like below.
+INFO: Application startup complete.
+```
+
+```bash
+# TGI service
+docker logs tgi-gaudi-service | grep Connected
+# If the service is ready, you will get the response like below.
+2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
+```
+
+Then try the `cURL` command below to validate services.
+
+```bash
+# either vLLM or TGI service
+curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
+  -X POST \
+  -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+```
+
+3. TTS Service
+
+```
+# speecht5 service
+curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/tts \
+  -X POST \
+  -d '{"text": "Who are you?"}' \
+  -H 'Content-Type: application/json'
+```
 
 ## 🚀 Test MegaService
 
@@ -6,7 +6,7 @@ services:
     image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
     container_name: whisper-service
     ports:
-      - "7066:7066"
+      - ${WHISPER_SERVER_PORT:-7066}:7066
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -22,7 +22,7 @@ services:
     image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
     container_name: speecht5-service
     ports:
-      - "7055:7055"
+      - ${SPEECHT5_SERVER_PORT:-7055}:7055
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -34,28 +34,27 @@ services:
     cap_add:
       - SYS_NICE
     restart: unless-stopped
-  tgi-service:
-    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
-    container_name: tgi-gaudi-server
+  vllm-service:
+    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+    container_name: vllm-gaudi-service
     ports:
-      - "3006:80"
+      - ${LLM_SERVER_PORT:-3006}:80
     volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HF_HUB_DISABLE_PROGRESS_BARS: 1
       HF_HUB_ENABLE_HF_TRANSFER: 0
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      ENABLE_HPU_GRAPH: true
-      LIMIT_HPU_GRAPH: true
-      USE_FLASH_ATTENTION: true
-      FLASH_ATTENTION_RECOMPUTE: true
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
     healthcheck:
-      test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
       interval: 10s
       timeout: 10s
       retries: 100
@@ -63,13 +62,13 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
   audioqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
     container_name: audioqna-gaudi-backend-server
     depends_on:
       - whisper-service
-      - tgi-service
+      - vllm-service
       - speecht5-service
     ports:
       - "3008:8888"
AudioQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml (new file)
@@ -0,0 +1,108 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  whisper-service:
    image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
    container_name: whisper-service
    ports:
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
    runtime: habana
    cap_add:
      - SYS_NICE
    restart: unless-stopped
  speecht5-service:
    image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
    container_name: speecht5-service
    ports:
      - ${SPEECHT5_SERVER_PORT:-7055}:7055
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
    runtime: habana
    cap_add:
      - SYS_NICE
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
    container_name: tgi-gaudi-service
    ports:
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      ENABLE_HPU_GRAPH: true
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
  audioqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-gaudi-backend-server
    depends_on:
      - whisper-service
      - tgi-service
      - speecht5-service
    ports:
      - "3008:8888"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
      - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
      - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
      - LLM_SERVER_PORT=${LLM_SERVER_PORT}
      - LLM_MODEL_ID=${LLM_MODEL_ID}
      - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
      - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
    ipc: host
    restart: always
  audioqna-gaudi-ui-server:
    image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
    container_name: audioqna-gaudi-ui-server
    depends_on:
      - audioqna-gaudi-backend-server
    ports:
      - "5173:5173"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
    ipc: host
    restart: always

networks:
  default:
    driver: bridge
@@ -8,7 +8,13 @@ export host_ip=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 # <token>
 
-export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+
+# set vLLM parameters
+export NUM_CARDS=1
+export BLOCK_SIZE=128
+export MAX_NUM_SEQS=256
+export MAX_SEQ_LEN_TO_CAPTURE=2048
 
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -71,3 +71,15 @@ services:
       dockerfile: comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile
     extends: audioqna
     image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
+  vllm:
+    build:
+      context: vllm
+      dockerfile: Dockerfile.cpu
+    extends: audioqna
+    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+  vllm-gaudi:
+    build:
+      context: vllm-fork
+      dockerfile: Dockerfile.hpu
+    extends: audioqna
+    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
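With the two build targets registered above, the images can also be produced directly from `build.yaml`; a sketch assuming the vLLM sources have already been cloned into `docker_image_build` under the `vllm` and `vllm-fork` directories named in the build contexts:

```bash
cd GenAIExamples/AudioQnA/docker_image_build
docker compose -f build.yaml build vllm --no-cache         # CPU image (opea/vllm)
docker compose -f build.yaml build vllm-gaudi --no-cache   # Gaudi image (opea/vllm-gaudi)
```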
@@ -31,18 +31,27 @@ function build_docker_images() {
     cd $WORKPATH/docker_image_build
     git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
 
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd vllm-fork/
+    VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+    echo "Check out vLLM tag ${VLLM_VER}"
+    git checkout ${VLLM_VER} &> /dev/null && cd ../
+
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
+    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi vllm-gaudi"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
-    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
     docker images && sleep 1s
 }
 
 function start_services() {
     cd $WORKPATH/docker_compose/intel/hpu/gaudi
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+    export NUM_CARDS=1
+    export BLOCK_SIZE=128
+    export MAX_NUM_SEQS=256
+    export MAX_SEQ_LEN_TO_CAPTURE=2048
 
     export MEGA_SERVICE_HOST_IP=${ip_address}
     export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -61,8 +70,8 @@ function start_services() {
     docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 200 ]]; do
-        docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log
-        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
+        docker logs vllm-gaudi-service > $LOG_PATH/vllm_service_start.log 2>&1
+        if grep -q complete $LOG_PATH/vllm_service_start.log; then
             break
         fi
         sleep 5s
@@ -86,7 +95,7 @@ function validate_megaservice() {
     # always print the log
     docker logs whisper-service > $LOG_PATH/whisper-service.log
     docker logs speecht5-service > $LOG_PATH/tts-service.log
-    docker logs tgi-gaudi-server > $LOG_PATH/tgi-gaudi-server.log
+    docker logs vllm-gaudi-service > $LOG_PATH/vllm-gaudi-service.log
     docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
     echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
 
@@ -126,7 +135,7 @@ function validate_megaservice() {
 
 function stop_docker() {
     cd $WORKPATH/docker_compose/intel/hpu/gaudi
-    docker compose stop && docker compose rm -f
+    docker compose -f compose.yaml stop && docker compose rm -f
 }
 
 function main() {
@@ -31,18 +31,23 @@ function build_docker_images() {
     cd $WORKPATH/docker_image_build
     git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
 
+    git clone https://github.com/vllm-project/vllm.git
+    cd ./vllm/
+    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+    echo "Check out vLLM tag ${VLLM_VER}"
+    git checkout ${VLLM_VER} &> /dev/null && cd ../
+
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="audioqna audioqna-ui whisper speecht5"
+    service_list="audioqna audioqna-ui whisper speecht5 vllm"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
-    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
     docker images && sleep 1s
 }
 
 function start_services() {
     cd $WORKPATH/docker_compose/intel/cpu/xeon/
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
 
     export MEGA_SERVICE_HOST_IP=${ip_address}
     export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -62,8 +67,8 @@ function start_services() {
     docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
     n=0
     until [[ "$n" -ge 200 ]]; do
-        docker logs tgi-service > $LOG_PATH/tgi_service_start.log
-        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
+        docker logs vllm-service > $LOG_PATH/vllm_service_start.log 2>&1
+        if grep -q complete $LOG_PATH/vllm_service_start.log; then
             break
         fi
         sleep 5s
@@ -77,7 +82,7 @@ function validate_megaservice() {
     # always print the log
     docker logs whisper-service > $LOG_PATH/whisper-service.log
     docker logs speecht5-service > $LOG_PATH/tts-service.log
-    docker logs tgi-service > $LOG_PATH/tgi-service.log
+    docker logs vllm-service > $LOG_PATH/vllm-service.log
     docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
     echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
 
@@ -117,7 +122,7 @@ function validate_megaservice() {
 
 function stop_docker() {
     cd $WORKPATH/docker_compose/intel/cpu/xeon/
-    docker compose stop && docker compose rm -f
+    docker compose -f compose.yaml stop && docker compose rm -f
 }
 
 function main() {
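The readiness loops above now grep the vLLM log for `complete`; an alternative sketch that keys off the compose healthcheck state instead, which would work unchanged for either backend:

```bash
# Wait (up to ~10 minutes) for the serving container to report healthy.
n=0
until [[ "$n" -ge 120 ]]; do
    status=$(docker inspect --format '{{.State.Health.Status}}' vllm-service 2>/dev/null)
    [[ "$status" == "healthy" ]] && break
    sleep 5
    n=$((n+1))
done
```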
AudioQnA/tests/test_compose_tgi_on_gaudi.sh (new file)
@@ -0,0 +1,146 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct

    export MEGA_SERVICE_HOST_IP=${ip_address}
    export WHISPER_SERVER_HOST_IP=${ip_address}
    export SPEECHT5_SERVER_HOST_IP=${ip_address}
    export LLM_SERVER_HOST_IP=${ip_address}

    export WHISPER_SERVER_PORT=7066
    export SPEECHT5_SERVER_PORT=7055
    export LLM_SERVER_PORT=3006

    export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
    export host_ip=${ip_address}
    # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

    # Start Docker Containers
    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
    until [[ "$n" -ge 200 ]]; do
        docker logs tgi-gaudi-service > $LOG_PATH/tgi_service_start.log
        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done

    n=0
    until [[ "$n" -ge 100 ]]; do
        docker logs whisper-service > $LOG_PATH/whisper_service_start.log
        if grep -q "Uvicorn server setup on port" $LOG_PATH/whisper_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done
}


function validate_megaservice() {
    response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
    # always print the log
    docker logs whisper-service > $LOG_PATH/whisper-service.log
    docker logs speecht5-service > $LOG_PATH/tts-service.log
    docker logs tgi-gaudi-service > $LOG_PATH/tgi-gaudi-service.log
    docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
    echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3

    if [[ $(file speech.mp3) == *"RIFF"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong."
        exit 1
    fi

}

#function validate_frontend() {
#    cd $WORKPATH/ui/svelte
#    local conda_env_name="OPEA_e2e"
#    export PATH=${HOME}/miniforge3/bin/:$PATH
##    conda remove -n ${conda_env_name} --all -y
##    conda create -n ${conda_env_name} python=3.12 -y
#    source activate ${conda_env_name}
#
#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
##    conda install -c conda-forge nodejs=22.6.0 -y
#    npm install && npm ci && npx playwright install --with-deps
#    node -v && npm -v && pip list
#
#    exit_status=0
#    npx playwright test || exit_status=$?
#
#    if [ $exit_status -ne 0 ]; then
#        echo "[TEST INFO]: ---------frontend test failed---------"
#        exit $exit_status
#    else
#        echo "[TEST INFO]: ---------frontend test passed---------"
#    fi
#}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    docker compose -f compose_tgi.yaml stop && docker compose rm -f
}

function main() {

    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_services

    validate_megaservice
    # validate_frontend

    stop_docker
    echo y | docker system prune

}

main
AudioQnA/tests/test_compose_tgi_on_xeon.sh (new file)
@@ -0,0 +1,137 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper speecht5"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct

    export MEGA_SERVICE_HOST_IP=${ip_address}
    export WHISPER_SERVER_HOST_IP=${ip_address}
    export SPEECHT5_SERVER_HOST_IP=${ip_address}
    export LLM_SERVER_HOST_IP=${ip_address}

    export WHISPER_SERVER_PORT=7066
    export SPEECHT5_SERVER_PORT=7055
    export LLM_SERVER_PORT=3006

    export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
    export host_ip=${ip_address}

    # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

    # Start Docker Containers
    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
    until [[ "$n" -ge 200 ]]; do
        docker logs tgi-service > $LOG_PATH/tgi_service_start.log
        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done
}


function validate_megaservice() {
    response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
    # always print the log
    docker logs whisper-service > $LOG_PATH/whisper-service.log
    docker logs speecht5-service > $LOG_PATH/tts-service.log
    docker logs tgi-service > $LOG_PATH/tgi-service.log
    docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
    echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3

    if [[ $(file speech.mp3) == *"RIFF"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong."
        exit 1
    fi

}

#function validate_frontend() {
#    cd $WORKPATH/ui/svelte
#    local conda_env_name="OPEA_e2e"
#    export PATH=${HOME}/miniforge3/bin/:$PATH
##    conda remove -n ${conda_env_name} --all -y
##    conda create -n ${conda_env_name} python=3.12 -y
#    source activate ${conda_env_name}
#
#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
##    conda install -c conda-forge nodejs=22.6.0 -y
#    npm install && npm ci && npx playwright install --with-deps
#    node -v && npm -v && pip list
#
#    exit_status=0
#    npx playwright test || exit_status=$?
#
#    if [ $exit_status -ne 0 ]; then
#        echo "[TEST INFO]: ---------frontend test failed---------"
#        exit $exit_status
#    else
#        echo "[TEST INFO]: ---------frontend test passed---------"
#    fi
#}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    docker compose -f compose_tgi.yaml stop && docker compose rm -f
}

function main() {

    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_services

    validate_megaservice
    # validate_frontend

    stop_docker
    echo y | docker system prune

}

main