Refactor AudioQnA/MultiModalQnA/AvatarChatbot (#1310)

Signed-off-by: chensuyue <suyue.chen@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: chensuyue <suyue.chen@intel.com>
2024-12-31 12:47:30 +08:00
parent 250ffb8b66
commit cc1d97f816
43 changed files with 482 additions and 1102 deletions
--- a/AudioQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/README.md
@@ -15,30 +15,20 @@ cd GenAIComps
 ### 2. Build ASR Image

 ```bash
-docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile .
-
-
-docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
 ```

 ### 3. Build LLM Image

-```bash
-docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile .
-```
-
-Note:
 For compose for ROCm example AMD optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm (https://github.com/huggingface/text-generation-inference)

 ### 4. Build TTS Image

 ```bash
-docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/dependency/Dockerfile .
-
-docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile .
+docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
 ```

-### 6. Build MegaService Docker Image
+### 5. Build MegaService Docker Image

 To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:

@@ -51,11 +41,8 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
 Then run the command `docker images`, you will have following images ready:

 1. `opea/whisper:latest`
-2. `opea/asr:latest`
-3. `opea/llm-tgi:latest`
-4. `opea/speecht5:latest`
-5. `opea/tts:latest`
-6. `opea/audioqna:latest`
+2. `opea/speecht5:latest`
+3. `opea/audioqna:latest`

 ## 🚀 Set the environment variables

@@ -65,20 +52,18 @@ Before starting the services with `docker compose`, you have to recheck the foll
 export host_ip=<your External Public IP>    # export host_ip=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=<your HF token>

-export TGI_LLM_ENDPOINT=http://$host_ip:3006
 export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3

-export ASR_ENDPOINT=http://$host_ip:7066
-export TTS_ENDPOINT=http://$host_ip:7055
-
 export MEGA_SERVICE_HOST_IP=${host_ip}
-export ASR_SERVICE_HOST_IP=${host_ip}
-export TTS_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}

-export ASR_SERVICE_PORT=3001
-export TTS_SERVICE_PORT=3002
-export LLM_SERVICE_PORT=3007
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 ```

 or use set_env.sh file to setup environment variables.
@@ -122,9 +107,10 @@ base64 string to the megaservice endpoint. The megaservice will return a spoken
 to the response, decode the base64 string and save it as a .wav file.

 ```bash
+# voice can be "default" or "male"
 curl http://${host_ip}:3008/v1/audioqna \
  -X POST \
-  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' \
+  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
 ```

@@ -137,34 +123,15 @@ curl http://${host_ip}:7066/v1/asr \
  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
  -H 'Content-Type: application/json'

-# asr microservice
-curl http://${host_ip}:3001/v1/audio/transcriptions \
-  -X POST \
-  -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-  -H 'Content-Type: application/json'
-
 # tgi service
 curl http://${host_ip}:3006/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
  -H 'Content-Type: application/json'

-# llm microservice
-curl http://${host_ip}:3007/v1/chat/completions\
-  -X POST \
-  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
-  -H 'Content-Type: application/json'
-
 # speecht5 service
 curl http://${host_ip}:7055/v1/tts \
  -X POST \
  -d '{"text": "Who are you?"}' \
  -H 'Content-Type: application/json'
-
-# tts microservice
-curl http://${host_ip}:3002/v1/audio/speech \
-  -X POST \
-  -d '{"text": "Who are you?"}' \
-  -H 'Content-Type: application/json'
-
 ```
--- a/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -13,14 +13,6 @@ services:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
-  asr:
-    image: ${REGISTRY:-opea}/asr:${TAG:-latest}
-    container_name: asr-service
-    ports:
-      - "3001:9099"
-    ipc: host
-    environment:
-      ASR_ENDPOINT: ${ASR_ENDPOINT}
  speecht5-service:
    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
    container_name: speecht5-service
@@ -32,14 +24,6 @@ services:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
-  tts:
-    image: ${REGISTRY:-opea}/tts:${TAG:-latest}
-    container_name: tts-service
-    ports:
-      - "3002:9088"
-    ipc: host
-    environment:
-      TTS_ENDPOINT: ${TTS_ENDPOINT}
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
    container_name: tgi-service
@@ -67,28 +51,13 @@ services:
    security_opt:
      - seccomp:unconfined
    ipc: host
-  llm:
-    image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
-    container_name: llm-tgi-server
-    depends_on:
-      - tgi-service
-    ports:
-      - "3007:9000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    restart: unless-stopped
  audioqna-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-xeon-backend-server
    depends_on:
-      - asr
-      - llm
-      - tts
+      - whisper-service
+      - tgi-service
+      - speecht5-service
    ports:
      - "3008:8888"
    environment:
@@ -96,12 +65,12 @@ services:
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
-      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
-      - ASR_SERVICE_PORT=${ASR_SERVICE_PORT}
-      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
-      - LLM_SERVICE_PORT=${LLM_SERVICE_PORT}
-      - TTS_SERVICE_HOST_IP=${TTS_SERVICE_HOST_IP}
-      - TTS_SERVICE_PORT=${TTS_SERVICE_PORT}
+      - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
+      - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
+      - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT}
+      - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
+      - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
    ipc: host
    restart: always

--- a/AudioQnA/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/set_env.sh
@@ -10,17 +10,15 @@ export host_ip="192.165.1.21"
 export HUGGINGFACEHUB_API_TOKEN=${YOUR_HUGGINGFACEHUB_API_TOKEN}
 # <token>

-export TGI_LLM_ENDPOINT=http://$host_ip:3006
 export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3

-export ASR_ENDPOINT=http://$host_ip:7066
-export TTS_ENDPOINT=http://$host_ip:7055
-
 export MEGA_SERVICE_HOST_IP=${host_ip}
-export ASR_SERVICE_HOST_IP=${host_ip}
-export TTS_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}

-export ASR_SERVICE_PORT=3001
-export TTS_SERVICE_PORT=3002
-export LLM_SERVICE_PORT=3007
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna