Refactor AudioQnA/MultiModalQnA/AvatarChatbot (#1310)

Signed-off-by: chensuyue <suyue.chen@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: chensuyue <suyue.chen@intel.com>
2024-12-31 12:47:30 +08:00
parent 250ffb8b66
commit cc1d97f816
43 changed files with 482 additions and 1102 deletions
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -14,27 +14,20 @@ cd GenAIComps
 ### 2. Build ASR Image

 ```bash
-docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile.intel_hpu .
-
-
-docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
 ```

 ### 3. Build LLM Image

-```bash
-docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile .
-```
+Intel Gaudi optimized image hosted in the Hugging Face repo will be used for the TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi)

 ### 4. Build TTS Image

 ```bash
-docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/dependency/Dockerfile.intel_hpu .
-
-docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile .
+docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile.intel_hpu .
 ```

-### 6. Build MegaService Docker Image
+### 5. Build MegaService Docker Image

 To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:

@@ -47,11 +40,8 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
 Then run the command `docker images`, you will have following images ready:

 1. `opea/whisper-gaudi:latest`
-2. `opea/asr:latest`
-3. `opea/llm-tgi:latest`
-4. `opea/speecht5-gaudi:latest`
-5. `opea/tts:latest`
-6. `opea/audioqna:latest`
+2. `opea/speecht5-gaudi:latest`
+3. `opea/audioqna:latest`

 ## 🚀 Set the environment variables

@@ -61,20 +51,18 @@ Before starting the services with `docker compose`, you have to recheck the foll
 export host_ip=<your External Public IP>    # export host_ip=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=<your HF token>

-export TGI_LLM_ENDPOINT=http://$host_ip:3006
 export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3

-export ASR_ENDPOINT=http://$host_ip:7066
-export TTS_ENDPOINT=http://$host_ip:7055
-
 export MEGA_SERVICE_HOST_IP=${host_ip}
-export ASR_SERVICE_HOST_IP=${host_ip}
-export TTS_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}

-export ASR_SERVICE_PORT=3001
-export TTS_SERVICE_PORT=3002
-export LLM_SERVICE_PORT=3007
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 ```

 ## 🚀 Start the MegaService
@@ -95,36 +83,18 @@ curl http://${host_ip}:7066/v1/asr \
  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
  -H 'Content-Type: application/json'

-# asr microservice
-curl http://${host_ip}:3001/v1/audio/transcriptions \
-  -X POST \
-  -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-  -H 'Content-Type: application/json'
-
 # tgi service
 curl http://${host_ip}:3006/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
  -H 'Content-Type: application/json'

-# llm microservice
-curl http://${host_ip}:3007/v1/chat/completions\
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
-  -H 'Content-Type: application/json'
-
 # speecht5 service
 curl http://${host_ip}:7055/v1/tts \
  -X POST \
  -d '{"text": "Who are you?"}' \
  -H 'Content-Type: application/json'

-# tts microservice
-curl http://${host_ip}:3002/v1/audio/speech \
-  -X POST \
-  -d '{"text": "Who are you?"}' \
-  -H 'Content-Type: application/json'
-
 ```

 ## 🚀 Test MegaService
@@ -134,8 +104,9 @@ base64 string to the megaservice endpoint. The megaservice will return a spoken
 to the response, decode the base64 string and save it as a .wav file.

 ```bash
+# voice can be "default" or "male"
 curl http://${host_ip}:3008/v1/audioqna \
  -X POST \
-  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' \
+  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
 ```
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -18,14 +18,6 @@ services:
    cap_add:
      - SYS_NICE
    restart: unless-stopped
-  asr:
-    image: ${REGISTRY:-opea}/asr:${TAG:-latest}
-    container_name: asr-service
-    ports:
-      - "3001:9099"
-    ipc: host
-    environment:
-      ASR_ENDPOINT: ${ASR_ENDPOINT}
  speecht5-service:
    image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
    container_name: speecht5-service
@@ -42,14 +34,6 @@ services:
    cap_add:
      - SYS_NICE
    restart: unless-stopped
-  tts:
-    image: ${REGISTRY:-opea}/tts:${TAG:-latest}
-    container_name: tts-service
-    ports:
-      - "3002:9088"
-    ipc: host
-    environment:
-      TTS_ENDPOINT: ${TTS_ENDPOINT}
  tgi-service:
    image: ghcr.io/huggingface/tgi-gaudi:2.0.6
    container_name: tgi-gaudi-server
@@ -75,28 +59,13 @@ services:
      - SYS_NICE
    ipc: host
    command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
-  llm:
-    image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
-    container_name: llm-tgi-gaudi-server
-    depends_on:
-      - tgi-service
-    ports:
-      - "3007:9000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    restart: unless-stopped
  audioqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-gaudi-backend-server
    depends_on:
-      - asr
-      - llm
-      - tts
+      - whisper-service
+      - tgi-service
+      - speecht5-service
    ports:
      - "3008:8888"
    environment:
@@ -104,12 +73,12 @@ services:
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
-      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
-      - ASR_SERVICE_PORT=${ASR_SERVICE_PORT}
-      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
-      - LLM_SERVICE_PORT=${LLM_SERVICE_PORT}
-      - TTS_SERVICE_HOST_IP=${TTS_SERVICE_HOST_IP}
-      - TTS_SERVICE_PORT=${TTS_SERVICE_PORT}
+      - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
+      - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
+      - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT}
+      - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
+      - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
    ipc: host
    restart: always
  audioqna-gaudi-ui-server:
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -2,6 +2,21 @@

 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-pushd "../../../../../" > /dev/null
-source .set_env.sh
-popd > /dev/null
+
+# export host_ip=<your External Public IP>
+export host_ip=$(hostname -I | awk '{print $1}')
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
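+# HUGGINGFACEHUB_API_TOKEN must already be set to your Hugging Face token (<token>) in the calling environment; the line above only re-exports it.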
+
+export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}
+
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna