Refactor asr/tts components (#1083)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Sihan Chen
2024-12-31 12:03:10 +08:00
committed by GitHub
parent 1040875055
commit a19c222636
46 changed files with 759 additions and 268 deletions

View File

@@ -0,0 +1,35 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:7066/health"]
interval: 10s
timeout: 6s
retries: 18
asr:
image: ${REGISTRY:-opea}/asr:${TAG:-latest}
container_name: asr-service
ports:
- "9099:9099"
ipc: host
environment:
ASR_ENDPOINT: ${ASR_ENDPOINT}
depends_on:
whisper-service:
condition: service_healthy
networks:
default:
driver: bridge
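A quick way to exercise this file (a minimal sketch, assuming it is saved as compose.yaml and docker compose v2 is available; the host-IP lookup mirrors the README below):

```bash
# Point the ASR wrapper at the whisper dependency (host IP is illustrative).
ip_address=$(hostname -I | awk '{print $1}')
export ASR_ENDPOINT=http://$ip_address:7066

# Start whisper-service and asr; depends_on waits on the healthcheck above.
docker compose -f compose.yaml up -d

# Confirm the whisper dependency is healthy before calling the ASR service.
curl http://localhost:7066/health
```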

View File

@@ -0,0 +1,40 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:7066/health"]
interval: 10s
timeout: 6s
retries: 18
asr:
image: ${REGISTRY:-opea}/asr:${TAG:-latest}
container_name: asr-service
ports:
- "3001:9099"
ipc: host
environment:
ASR_ENDPOINT: ${ASR_ENDPOINT}
depends_on:
whisper-service:
condition: service_healthy
networks:
default:
driver: bridge

View File

@@ -16,13 +16,13 @@ COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/whisper/requirements.txt ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/src/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/asr/whisper/requirements.txt ; \
pip install --no-cache-dir -r /home/user/comps/asr/src/requirements.txt ; \
fi
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/asr/whisper
WORKDIR /home/user/comps/asr/src
ENTRYPOINT ["python", "asr.py"]
ENTRYPOINT ["python", "opea_asr_microservice.py"]

View File

@@ -17,7 +17,7 @@ pip install -r requirements.txt
- Xeon CPU
```bash
cd dependency/
cd integrations/dependency/whisper
nohup python whisper_server.py --device=cpu &
python check_whisper_server.py
```
@@ -51,15 +51,15 @@ curl http://localhost:7066/v1/audio/transcriptions \
### 1.3 Start ASR Service/Test
```bash
cd ../
python asr.py
cd ../../..
python opea_asr_microservice.py
python check_asr_server.py
```
While the Whisper service is running, you can start the ASR service. If the ASR service is running properly, you should see output similar to the following:
```bash
{'id': '0e686efd33175ce0ebcf7e0ed7431673', 'text': 'who is pat gelsinger'}
{'text': 'who is pat gelsinger'}
```
## 🚀2. Start Microservice with Docker (Option 2)
@@ -74,20 +74,20 @@ Alternatively, you can also start the ASR microservice with Docker.
```bash
cd ../..
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile .
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/Dockerfile .
```
- Gaudi2 HPU
```bash
cd ../..
docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile.intel_hpu .
docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/Dockerfile.intel_hpu .
```
#### 2.1.2 ASR Service Image
```bash
docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/Dockerfile .
```
### 2.2 Start Whisper and ASR Service
@@ -97,13 +97,13 @@ docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg
- Xeon
```bash
docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest
docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy opea/whisper:latest
```
- Gaudi2 HPU
```bash
docker run -p 7066:7066 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper-gaudi:latest
docker run -p 7066:7066 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy opea/whisper-gaudi:latest
```
#### 2.2.2 Start ASR service
@@ -111,7 +111,7 @@ docker run -p 7066:7066 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_M
```bash
ip_address=$(hostname -I | awk '{print $1}')
docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ASR_ENDPOINT=http://$ip_address:7066 opea/asr:latest
docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e ASR_ENDPOINT=http://$ip_address:7066 opea/asr:latest
```
#### 2.2.3 Test
@@ -120,8 +120,11 @@ docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$
# Use curl or python
# curl
http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json'
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://localhost:9099/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@./sample.wav" \
-F model="openai/whisper-small"
# python
python check_asr_server.py

View File

@@ -20,11 +20,24 @@ urllib.request.urlretrieve(
file_name,
)
with open(file_name, "rb") as f:
test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8")
os.remove(file_name)
endpoint = "http://localhost:9099/v1/audio/transcriptions"
inputs = {"byte_str": test_audio_base64_str}
response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
print(response.json())
headers = {"accept": "application/json"}
# Prepare the data and files
data = {
"model": "openai/whisper-small",
"language": "english",
}
try:
with open(file_name, "rb") as audio_file:
files = {"file": (file_name, audio_file)}
response = requests.post(endpoint, headers=headers, data=data, files=files)
if response.status_code != 200:
print(f"Failure with {response.reason}!")
else:
print(response.json())
except Exception as e:
print(f"Failure with {e}!")
os.remove(file_name)

View File

@@ -20,16 +20,16 @@ COPY --chown=user:user comps /home/user/comps
USER user
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/asr/whisper/requirements.txt && \
pip install --no-cache-dir -r /home/user/comps/asr/src/requirements.txt && \
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/whisper/requirements.txt ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/src/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/asr/whisper/requirements.txt ; \
pip install --no-cache-dir -r /home/user/comps/asr/src/requirements.txt ; \
fi
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/asr/whisper/dependency
WORKDIR /home/user/comps/asr/src/integrations/dependency/whisper
ENTRYPOINT ["python", "whisper_server.py", "--device", "cpu"]

View File

@@ -23,11 +23,11 @@ USER user
# Install requirements and optimum habana
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/asr/whisper/requirements.txt && \
pip install --no-cache-dir -r /home/user/comps/asr/src/requirements.txt && \
pip install --no-cache-dir optimum[habana]
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/asr/whisper/dependency
WORKDIR /home/user/comps/asr/src/integrations/dependency/whisper
ENTRYPOINT ["python", "whisper_server.py", "--device", "hpu"]

View File

@@ -5,7 +5,7 @@ import argparse
import base64
import os
import uuid
from typing import List, Optional, Union
from typing import List
import uvicorn
from fastapi import FastAPI, File, Form, Request, UploadFile
@@ -28,7 +28,7 @@ app.add_middleware(
)
@app.get("/v1/health")
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)

View File

@@ -0,0 +1,76 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from typing import List
import requests
from fastapi import File, Form, UploadFile
from comps import CustomLogger, OpeaComponent, ServiceType
from comps.cores.proto.api_protocol import AudioTranscriptionResponse
logger = CustomLogger("opea_whisper")
logflag = os.getenv("LOGFLAG", False)
class OpeaWhisperAsr(OpeaComponent):
"""A specialized ASR (Automatic Speech Recognition) component derived from OpeaComponent for Whisper ASR services.
Attributes:
model_name (str): The name of the ASR model used.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.ASR.name.lower(), description, config)
self.base_url = os.getenv("ASR_ENDPOINT", "http://localhost:7066")
async def invoke(
self,
file: UploadFile = File(...), # Handling the uploaded file directly
model: str = Form("openai/whisper-small"),
language: str = Form("english"),
prompt: str = Form(None),
response_format: str = Form("json"),
temperature: float = Form(0),
timestamp_granularities: List[str] = Form(None),
) -> AudioTranscriptionResponse:
"""Invoke the ASR service to generate transcription for the provided input."""
# Read the uploaded file
file_contents = await file.read()
# Prepare the files and data for requests.post
files = {
"file": (file.filename, file_contents, file.content_type),
}
data = {
"model": model,
"language": language,
"prompt": prompt,
"response_format": response_format,
"temperature": temperature,
"timestamp_granularities": timestamp_granularities,
}
# Send the file and model to the server
response = requests.post(f"{self.base_url}/v1/audio/transcriptions", files=files, data=data)
res = response.json()["text"]
return AudioTranscriptionResponse(text=res)
def check_health(self) -> bool:
"""Checks the health of the ASR service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False
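For reference, this component just forwards a multipart request to the Whisper server at ASR_ENDPOINT and polls its /health route; a hedged sketch of the equivalent raw calls (the localhost endpoint and sample.wav are assumptions):

```bash
# What check_health() queries (default base_url shown).
curl http://localhost:7066/health

# What invoke() forwards, assuming a local sample.wav exists.
curl http://localhost:7066/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@./sample.wav" \
  -F model="openai/whisper-small" \
  -F language="english"
```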

View File

@@ -0,0 +1,94 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from typing import List
from fastapi import File, Form, UploadFile
from integrations.opea_whisper import OpeaWhisperAsr
from comps import (
Base64ByteStrDoc,
CustomLogger,
LLMParamsDoc,
OpeaComponentController,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
from comps.cores.proto.api_protocol import AudioTranscriptionResponse
logger = CustomLogger("opea_asr_microservice")
logflag = os.getenv("LOGFLAG", False)
# Initialize OpeaComponentController
controller = OpeaComponentController()
# Register components
try:
# Instantiate ASR components
opea_whisper = OpeaWhisperAsr(
name="OpeaWhisperAsr",
description="OPEA Whisper ASR Service",
)
# Register components with the controller
controller.register(opea_whisper)
# Discover and activate a healthy component
controller.discover_and_activate()
except Exception as e:
logger.error(f"Failed to initialize components: {e}")
@register_microservice(
name="opea_service@asr",
service_type=ServiceType.ASR,
endpoint="/v1/audio/transcriptions",
host="0.0.0.0",
port=9099,
input_datatype=Base64ByteStrDoc,
output_datatype=LLMParamsDoc,
)
@register_statistics(names=["opea_service@asr"])
async def audio_to_text(
file: UploadFile = File(...), # Handling the uploaded file directly
model: str = Form("openai/whisper-small"),
language: str = Form("english"),
prompt: str = Form(None),
response_format: str = Form("json"),
temperature: float = Form(0),
timestamp_granularities: List[str] = Form(None),
) -> AudioTranscriptionResponse:
start = time.time()
if logflag:
logger.info("ASR file uploaded.")
try:
# Use the controller to invoke the active component
asr_response = await controller.invoke(
file=file,
model=model,
language=language,
prompt=prompt,
response_format=response_format,
temperature=temperature,
timestamp_granularities=timestamp_granularities,
)
if logflag:
logger.info(asr_response)
statistics_dict["opea_service@asr"].append_latency(time.time() - start, None)
return asr_response
except Exception as e:
logger.error(f"Error during asr invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA ASR Microservice is starting....")
opea_microservices["opea_service@asr"].start()

View File

@@ -1,53 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
import requests
from comps import CustomLogger
logger = CustomLogger("asr")
logflag = os.getenv("LOGFLAG", False)
from comps import (
Base64ByteStrDoc,
LLMParamsDoc,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
@register_microservice(
name="opea_service@asr",
service_type=ServiceType.ASR,
endpoint="/v1/audio/transcriptions",
host="0.0.0.0",
port=9099,
input_datatype=Base64ByteStrDoc,
output_datatype=LLMParamsDoc,
)
@register_statistics(names=["opea_service@asr"])
async def audio_to_text(audio: Base64ByteStrDoc):
start = time.time()
byte_str = audio.byte_str
inputs = {"audio": byte_str}
if logflag:
logger.info(inputs)
response = requests.post(url=f"{asr_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None})
if logflag:
logger.info(response)
statistics_dict["opea_service@asr"].append_latency(time.time() - start, None)
return LLMParamsDoc(query=response.json()["asr_result"])
if __name__ == "__main__":
asr_endpoint = os.getenv("ASR_ENDPOINT", "http://localhost:7066")
logger.info("[asr - router] ASR initialized.")
opea_microservices["opea_service@asr"].start()

View File

@@ -279,6 +279,7 @@ class DocSumChatCompletionRequest(BaseModel):
class AudioChatCompletionRequest(BaseModel):
audio: str
voice: str = "default"
messages: Optional[
Union[
str,

View File

@@ -0,0 +1,35 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
gpt-sovits-service:
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
container_name: gpt-sovits-service
ports:
- "9880:9880"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9880/health"]
interval: 10s
timeout: 6s
retries: 18
tts:
image: ${REGISTRY:-opea}/tts:${TAG:-latest}
container_name: tts-service
ports:
- "3002:9088"
ipc: host
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
depends_on:
gpt-sovits-service:
condition: service_healthy
networks:
default:
driver: bridge

View File

@@ -0,0 +1,35 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
speecht5-service:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:7055/health"]
interval: 10s
timeout: 6s
retries: 18
tts:
image: ${REGISTRY:-opea}/tts:${TAG:-latest}
container_name: tts-service
ports:
- "3002:9088"
ipc: host
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
depends_on:
speecht5-service:
condition: service_healthy
networks:
default:
driver: bridge
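As with the ASR stack, a minimal sketch of bringing up this pair (the compose.yaml name and host IP are assumptions):

```bash
# Point the tts wrapper at the speecht5 dependency (host IP is illustrative).
ip_address=$(hostname -I | awk '{print $1}')
export TTS_ENDPOINT=http://$ip_address:7055

docker compose -f compose.yaml up -d

# speecht5-service exposes the /health route used by the healthcheck.
curl http://localhost:7055/health
```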

View File

@@ -0,0 +1,40 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
speecht5-service:
image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:7055/health"]
interval: 10s
timeout: 6s
retries: 18
tts:
image: ${REGISTRY:-opea}/tts:${TAG:-latest}
container_name: tts-service
ports:
- "3002:9088"
ipc: host
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
depends_on:
speecht5-service:
condition: service_healthy
networks:
default:
driver: bridge

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,53 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
import requests
from comps import (
Base64ByteStrDoc,
CustomLogger,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("tts")
logflag = os.getenv("LOGFLAG", False)
@register_microservice(
name="opea_service@tts",
service_type=ServiceType.TTS,
endpoint="/v1/audio/speech",
host="0.0.0.0",
port=9088,
input_datatype=TextDoc,
output_datatype=Base64ByteStrDoc,
)
@register_statistics(names=["opea_service@tts"])
async def text_to_audio(input: TextDoc):
if logflag:
logger.info(input)
start = time.time()
text = input.text
inputs = {"text": text}
response = requests.post(url=f"{tts_endpoint}/v1/tts", data=json.dumps(inputs), proxies={"http": None})
statistics_dict["opea_service@tts"].append_latency(time.time() - start, None)
result = Base64ByteStrDoc(byte_str=response.json()["tts_result"])
if logflag:
logger.info(result)
return result
if __name__ == "__main__":
tts_endpoint = os.getenv("TTS_ENDPOINT", "http://localhost:7055")
logger.info("[tts - router] TTS initialized.")
opea_microservices["opea_service@tts"].start()

View File

@@ -14,13 +14,13 @@ COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/speecht5/requirements.txt ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/src/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/tts/speecht5/requirements.txt ; \
pip install --no-cache-dir -r /home/user/comps/tts/src/requirements.txt ; \
fi
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/tts/speecht5
WORKDIR /home/user/comps/tts/src
ENTRYPOINT ["python", "tts.py"]
ENTRYPOINT ["python", "opea_tts_microservice.py"]

View File

@@ -7,7 +7,7 @@ TTS (Text-To-Speech) microservice helps users convert text to speech. When build
- Xeon CPU
```bash
cd dependency/
cd integrations/dependency/speecht5
nohup python speecht5_server.py --device=cpu &
curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json'
```
@@ -17,7 +17,7 @@ curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Conte
```bash
pip install optimum[habana]
cd dependency/
cd integrations/dependency/speecht5
nohup python speecht5_server.py --device=hpu &
curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json'
```
@@ -25,9 +25,9 @@ curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Conte
## 1.3 Start TTS Service/Test
```bash
python tts.py
python opea_tts_microservice.py
curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json'
curl http://localhost:9088/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
## 🚀2. Start Microservice with Docker (Option 2)
@@ -42,20 +42,20 @@ Alternatively, you can start the TTS microservice with Docker.
```bash
cd ../../../
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/dependency/Dockerfile .
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
```
- Gaudi2 HPU
```bash
cd ../../../
docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/dependency/Dockerfile.intel_hpu .
docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile.intel_hpu .
```
#### 2.1.2 TTS Service Image
```bash
docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile .
docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/Dockerfile .
```
### 2.2 Start SpeechT5 and TTS Service
@@ -89,7 +89,5 @@ curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Conte
# openai protocol compatible
# voice can be 'male' or 'default'
curl http://localhost:7055/v1/audio/speech -XPOST -d '{"input":"Who are you?", "voice": "male"}' -H 'Content-Type: application/json' --output speech.wav
curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json'
curl http://localhost:9088/v1/audio/speech -XPOST -d '{"input":"Who are you?", "voice": "male"}' -H 'Content-Type: application/json' --output speech.wav
```

View File

@@ -7,7 +7,7 @@ This microservice is validated on Xeon/CUDA. HPU support is under development.
## Build the Image
```bash
docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/gpt-sovits/Dockerfile .
docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile .
```
## Start the Service

View File

@@ -23,13 +23,13 @@ USER user
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/speecht5/requirements.txt ; \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/src/integrations/dependency/speecht5/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/tts/speecht5/requirements.txt ; \
pip install --no-cache-dir -r /home/user/comps/tts/src/integrations/dependency/speecht5/requirements.txt ; \
fi
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/tts/speecht5/dependency
WORKDIR /home/user/comps/tts/src/integrations/dependency/speecht5
ENTRYPOINT ["python", "speecht5_server.py", "--device", "cpu"]

View File

@@ -24,11 +24,11 @@ USER user
# Install requirements and optimum habana
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/tts/speecht5/requirements.txt && \
pip install --no-cache-dir -r /home/user/comps/tts/src/integrations/dependency/speecht5/requirements.txt && \
pip install --no-cache-dir optimum[habana]
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/tts/speecht5/dependency
WORKDIR /home/user/comps/tts/src/integrations/dependency/speecht5
ENTRYPOINT ["python", "speecht5_server.py", "--device", "hpu"]

View File

@@ -24,6 +24,7 @@ class SpeechT5Model:
self.processor = SpeechT5Processor.from_pretrained(self.model_name_or_path, normalize=True)
self.vocoder = SpeechT5HifiGan.from_pretrained(vocoder_model_name_or_path).to(device)
self.vocoder.eval()
self.voice = "default"
# fetch default speaker embedding
try:
@@ -89,8 +90,13 @@ class SpeechT5Model:
)
def t2s(self, text, voice="default"):
if voice == "male":
self.default_speaker_embedding = torch.load("spk_embed_male.pt")
if self.voice != voice:
try:
print(f"Loading spk embedding with voice: {voice}.")
self.default_speaker_embedding = torch.load(f"spk_embed_{voice}.pt")
self.voice = voice
except Exception as e:
print(e)
if self.device == "hpu":
# See https://github.com/huggingface/optimum-habana/pull/824
from optimum.habana.utils import set_seed

View File

@@ -26,7 +26,7 @@ app.add_middleware(
)
@app.get("/v1/health")
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@@ -37,8 +37,9 @@ async def text_to_speech(request: Request):
logger.info("SpeechT5 generation begin.")
request_dict = await request.json()
text = request_dict.pop("text")
voice = request_dict.pop("voice", "default")
speech = tts.t2s(text)
speech = tts.t2s(text, voice)
sf.write("tmp.wav", speech, samplerate=16000)
with open("tmp.wav", "rb") as f:
bytes = f.read()
@@ -48,13 +49,8 @@ async def text_to_speech(request: Request):
@app.post("/v1/audio/speech")
async def audio_speech(request: AudioSpeechRequest):
async def audio_speech(request: AudioSpeechRequest) -> StreamingResponse:
logger.info("SpeechT5 generation begin.")
# validate the request parameters
if request.model != tts.model_name_or_path:
raise Exception("TTS model mismatch! Currently only support model: microsoft/speecht5_tts")
if request.voice not in ["default", "male"] or request.speed != 1.0:
logger.warning("Currently parameter 'speed' can only be 1.0 and 'voice' can only be default or male!")
speech = tts.t2s(request.input, voice=request.voice)

View File

@@ -0,0 +1,55 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
import requests
from fastapi.responses import StreamingResponse
from comps import CustomLogger, OpeaComponent, ServiceType
from comps.cores.proto.api_protocol import AudioSpeechRequest
logger = CustomLogger("opea_gptsovits")
logflag = os.getenv("LOGFLAG", False)
class OpeaGptsovitsTts(OpeaComponent):
"""A specialized TTS (Text To Speech) component derived from OpeaComponent for GPTSoVITS TTS services.
Attributes:
model_name (str): The name of the TTS model used.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.TTS.name.lower(), description, config)
self.base_url = os.getenv("TTS_ENDPOINT", "http://localhost:9880")
async def invoke(
self,
request: AudioSpeechRequest,
) -> requests.models.Response:
"""Invoke the TTS service to generate speech for the provided input."""
# see https://github.com/Spycsh/GPT-SoVITS/blob/openai_compat/api.py#L948 for usage
# make sure you change the refer_wav_path locally
request.voice = None
response = requests.post(f"{self.base_url}/v1/audio/speech", data=request.json())
return response
def check_health(self) -> bool:
"""Checks the health of the TTS service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False
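Since invoke() clears request.voice and posts the serialized AudioSpeechRequest to the GPT-SoVITS server, a rough equivalent of the forwarded call looks like this (the field set and output name are assumptions; see the linked api.py for the authoritative parameters):

```bash
# Sketch of the request OpeaGptsovitsTts forwards; 9880 is the default base_url port.
curl http://localhost:9880/v1/audio/speech \
  -XPOST \
  -d '{"input": "Who are you?"}' \
  -H 'Content-Type: application/json' \
  --output speech.wav
```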

View File

@@ -0,0 +1,57 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
import requests
from fastapi.responses import StreamingResponse
from comps import CustomLogger, OpeaComponent, ServiceType
from comps.cores.proto.api_protocol import AudioSpeechRequest
logger = CustomLogger("opea_speecht5")
logflag = os.getenv("LOGFLAG", False)
class OpeaSpeecht5Tts(OpeaComponent):
"""A specialized TTS (Text To Speech) component derived from OpeaComponent for SpeechT5 TTS services.
Attributes:
model_name (str): The name of the TTS model used.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.TTS.name.lower(), description, config)
self.base_url = os.getenv("TTS_ENDPOINT", "http://localhost:7055")
def invoke(
self,
request: AudioSpeechRequest,
) -> requests.models.Response:
"""Invoke the TTS service to generate speech for the provided input."""
# validate the request parameters
if request.model not in ["microsoft/speecht5_tts"]:
raise Exception("TTS model mismatch! Currently only support model: microsoft/speecht5_tts")
if request.voice not in ["default", "male"] or request.speed != 1.0:
logger.warning("Currently parameter 'speed' can only be 1.0 and 'voice' can only be default or male!")
response = requests.post(f"{self.base_url}/v1/audio/speech", data=request.json())
return response
def check_health(self) -> bool:
"""Checks the health of the TTS service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False
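invoke() here validates the model and voice, then forwards the JSON body to the SpeechT5 server's OpenAI-style route; a hedged equivalent of that forwarded call against the default base_url (the output filename is illustrative):

```bash
# Mirrors the payload OpeaSpeecht5Tts forwards to the speecht5 server.
curl http://localhost:7055/v1/audio/speech \
  -XPOST \
  -d '{"input": "Who are you?", "model": "microsoft/speecht5_tts", "voice": "male"}' \
  -H 'Content-Type: application/json' \
  --output speech.wav
```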

View File

@@ -0,0 +1,88 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from fastapi.responses import StreamingResponse
from integrations.opea_gptsovits import OpeaGptsovitsTts
from integrations.opea_speecht5 import OpeaSpeecht5Tts
from comps import (
CustomLogger,
OpeaComponentController,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
from comps.cores.proto.api_protocol import AudioSpeechRequest
logger = CustomLogger("opea_tts_microservice")
logflag = os.getenv("LOGFLAG", False)
# Initialize OpeaComponentController
controller = OpeaComponentController()
# Register components
try:
# Instantiate TTS components
opea_speecht5 = OpeaSpeecht5Tts(
name="OpeaSpeecht5Tts",
description="OPEA SpeechT5 TTS Service",
)
opea_gptsovits = OpeaGptsovitsTts(
name="OpeaGptsovitsTts",
description="OPEA GPTSoVITS TTS Service",
)
# Register components with the controller
controller.register(opea_speecht5)
controller.register(opea_gptsovits)
# Discover and activate a healthy component
controller.discover_and_activate()
except Exception as e:
logger.error(f"Failed to initialize components: {e}")
async def stream_forwarder(response):
"""Forward the stream chunks to the client using iter_content."""
for chunk in response.iter_content(chunk_size=1024):
yield chunk
@register_microservice(
name="opea_service@tts",
service_type=ServiceType.TTS,
endpoint="/v1/audio/speech",
host="0.0.0.0",
port=9088,
input_datatype=AudioSpeechRequest,
output_datatype=StreamingResponse,
)
@register_statistics(names=["opea_service@tts"])
async def text_to_speech(request: AudioSpeechRequest) -> StreamingResponse:
start = time.time()
if logflag:
logger.info(f"Input received: {request}")
try:
# Use the controller to invoke the active component
tts_response = controller.invoke(request)
if logflag:
logger.info(tts_response)
statistics_dict["opea_service@tts"].append_latency(time.time() - start, None)
return StreamingResponse(stream_forwarder(tts_response))
except Exception as e:
logger.error(f"Error during tts invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA TTS Microservice is starting....")
opea_microservices["opea_service@tts"].start()

View File

@@ -0,0 +1,11 @@
aiohttp
docarray[full]
fastapi
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
pydantic==2.9.1
pyyaml
shortuuid
uvicorn