From a01729a5c2722d0eebb2ce1987e922bf4dcf4549 Mon Sep 17 00:00:00 2001
From: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Date: Thu, 26 Dec 2024 14:45:17 +0800
Subject: [PATCH] Refactor DocSum example (#1286)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 DocSum/Dockerfile                             |   3 +-
 .../docker_compose/amd/gpu/rocm/compose.yaml  |  31 +-
 DocSum/docker_compose/amd/gpu/rocm/set_env.sh |   8 -
 .../docker_compose/intel/cpu/xeon/README.md   |  63 +--
 .../intel/cpu/xeon/compose.yaml               |  33 +-
 .../docker_compose/intel/hpu/gaudi/README.md  |  59 +--
 .../intel/hpu/gaudi/compose.yaml              |  33 +-
 DocSum/docker_compose/set_env.sh              |  14 +-
 DocSum/docker_image_build/build.yaml          |  27 --
 DocSum/docsum.py                              | 118 ++++--
 .../intel/cpu/xeon/manifest/docsum.yaml       | 347 ----------------
 .../intel/hpu/gaudi/manifest/docsum.yaml      | 375 +-----------------
 DocSum/tests/test_compose_on_gaudi.sh         |  54 +--
 DocSum/tests/test_compose_on_rocm.sh          |  51 +--
 DocSum/tests/test_compose_on_xeon.sh          |  54 +--
 DocSum/ui/gradio/docsum_ui_gradio.py          |  18 +-
 16 files changed, 145 insertions(+), 1143 deletions(-)

diff --git a/DocSum/Dockerfile b/DocSum/Dockerfile
index 183aff49d..27e08ee7a 100644
--- a/DocSum/Dockerfile
+++ b/DocSum/Dockerfile
@@ -6,7 +6,8 @@ FROM python:3.11-slim
 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
     libgl1-mesa-glx \
     libjemalloc-dev \
-    git
+    git \
+    ffmpeg
 
 RUN useradd -m -s /bin/bash user && \
     mkdir -p /home/user && \
diff --git a/DocSum/docker_compose/amd/gpu/rocm/compose.yaml b/DocSum/docker_compose/amd/gpu/rocm/compose.yaml
index 933317001..fa36310ad 100644
--- a/DocSum/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/DocSum/docker_compose/amd/gpu/rocm/compose.yaml
@@ -70,34 +70,6 @@ services:
       https_proxy: ${https_proxy}
     restart: unless-stopped
 
-  dataprep-audio2text:
-    image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
-    container_name: dataprep-audio2text-service
-    ports:
-      - "9099:9099"
-    ipc: host
-    environment:
-      A2T_ENDPOINT: ${A2T_ENDPOINT}
-
-  dataprep-video2audio:
-    image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
-    container_name: dataprep-video2audio-service
-    ports:
-      - "7078:7078"
-    ipc: host
-    environment:
-      V2A_ENDPOINT: ${V2A_ENDPOINT}
-
-  dataprep-multimedia2text:
-    image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
-    container_name: dataprep-multimedia2text
-    ports:
-      - "7079:7079"
-    ipc: host
-    environment:
-      V2A_ENDPOINT: ${V2A_ENDPOINT}
-      A2T_ENDPOINT: ${A2T_ENDPOINT}
-
   docsum-backend-server:
     image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
     container_name: docsum-backend-server
@@ -111,8 +83,9 @@ services:
       - https_proxy=${https_proxy}
       - http_proxy=${http_proxy}
       - MEGA_SERVICE_HOST_IP=${HOST_IP}
-      - DATA_SERVICE_HOST_IP=${DATA_SERVICE_HOST_IP}
       - LLM_SERVICE_HOST_IP=${HOST_IP}
+      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
+
     ipc: host
     restart: always
 
diff --git a/DocSum/docker_compose/amd/gpu/rocm/set_env.sh b/DocSum/docker_compose/amd/gpu/rocm/set_env.sh
index 0b48a19fb..797c6b8a4 100644
--- a/DocSum/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/DocSum/docker_compose/amd/gpu/rocm/set_env.sh
@@ -15,11 +15,3 @@ export DOCSUM_LLM_SERVER_PORT="9000"
 export DOCSUM_BACKEND_SERVER_PORT="8888"
 export DOCSUM_FRONTEND_PORT="5173"
 export BACKEND_SERVICE_ENDPOINT="http://${HOST_IP}:${DOCSUM_BACKEND_SERVER_PORT}/v1/docsum"
-export V2A_SERVICE_HOST_IP=${host_ip}
-export V2A_ENDPOINT=http://$host_ip:7078
-export A2T_ENDPOINT=http://$host_ip:7066
-export A2T_SERVICE_HOST_IP=${host_ip}
-export A2T_SERVICE_PORT=9099
-export DATA_ENDPOINT=http://$host_ip:7079
-export DATA_SERVICE_HOST_IP=${host_ip}
-export DATA_SERVICE_PORT=7079
diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md
index 3a3828bf2..5c579e82c 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/README.md
+++ b/DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -29,30 +29,6 @@ The Whisper Service converts audio files to text. Follow these steps to build an
 docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile .
 ```
 
-#### Audio to text Service
-
-The Audio to text Service is another service for converting audio to text. Follow these steps to build and run the service:
-
-```bash
-docker build -t opea/dataprep-audio2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/audio2text/Dockerfile .
-```
-
-#### Video to Audio Service
-
-The Video to Audio Service extracts audio from video files. Follow these steps to build and run the service:
-
-```bash
-docker build -t opea/dataprep-video2audio:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/video2audio/Dockerfile .
-```
-
-#### Multimedia to Text Service
-
-The Multimedia to Text Service transforms multimedia data to text data. Follow these steps to build and run the service:
-
-```bash
-docker build -t opea/dataprep-multimedia2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/Dockerfile .
-```
-
 ### 2. Build MegaService Docker Image
 
 To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image via below command:
@@ -149,9 +125,6 @@ You will have the following Docker Images:
 2. `opea/docsum:latest`
 3. `opea/llm-docsum-tgi:latest`
 4. `opea/whisper:latest`
-5. `opea/dataprep-audio2text:latest`
-6. `opea/dataprep-multimedia2text:latest`
-7. `opea/dataprep-video2audio:latest`
 
 ### Validate Microservices
 
@@ -188,37 +161,7 @@ You will have the following Docker Images:
      {"asr_result":"you"}
    ```
 
-4. Audio2Text Microservice
-
-   ```bash
-    curl http://${host_ip}:9099/v1/audio/transcriptions \
-        -X POST \
-        -d '{"byte_str":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-        -H 'Content-Type: application/json'
-   ```
-
-   Expected output:
-
-   ```bash
-     {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
-   ```
-
-5. Multimedia to text Microservice
-
-   ```bash
-    curl http://${host_ip}:7079/v1/multimedia2text \
-        -X POST \
-        -d '{"audio":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-        -H 'Content-Type: application/json'
-   ```
-
-   Expected output:
-
-   ```bash
-     {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
-   ```
-
-6. MegaService
+4. MegaService
 
    Text:
 
@@ -257,7 +200,7 @@ You will have the following Docker Images:
       -F "stream=true"
    ```
 
-   > Audio and Video file uploads are not supported in docsum with curl request, please use the Gradio-UI.
+   > Audio and video file uploads are not supported in DocSum via curl requests; please use the Gradio UI. You can still pass the base64-encoded string of the audio or video file as follows:
 
    Audio:
 
@@ -291,7 +234,7 @@ You will have the following Docker Images:
       -F "stream=true"
    ```
 
-7. MegaService with long context
+5. MegaService with long context
 
    If you want to deal with long context, can set following parameters and select suitable summary type.
 
diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
index a0285d9ce..42e89ee25 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
@@ -50,43 +50,12 @@ services:
       https_proxy: ${https_proxy}
     restart: unless-stopped
 
-  dataprep-audio2text:
-    image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
-    container_name: dataprep-audio2text-server
-    ports:
-      - "9099:9099"
-    ipc: host
-    environment:
-      A2T_ENDPOINT: ${A2T_ENDPOINT}
-
-  dataprep-video2audio:
-    image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
-    container_name: dataprep-video2audio-server
-    ports:
-      - "7078:7078"
-    ipc: host
-    environment:
-      V2A_ENDPOINT: ${V2A_ENDPOINT}
-
-  dataprep-multimedia2text:
-    image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
-    container_name: dataprep-multimedia2text
-    ports:
-      - "7079:7079"
-    ipc: host
-    environment:
-      V2A_ENDPOINT: ${V2A_ENDPOINT}
-      A2T_ENDPOINT: ${A2T_ENDPOINT}
-
   docsum-xeon-backend-server:
     image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
     container_name: docsum-xeon-backend-server
     depends_on:
       - tgi-server
       - llm-docsum-tgi
-      - dataprep-multimedia2text
-      - dataprep-video2audio
-      - dataprep-audio2text
     ports:
       - "8888:8888"
     environment:
@@ -94,8 +63,8 @@ services:
       - https_proxy=${https_proxy}
       - http_proxy=${http_proxy}
       - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
-      - DATA_SERVICE_HOST_IP=${DATA_SERVICE_HOST_IP}
       - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
     ipc: host
     restart: always
 
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md
index e47ed2f43..5a9deec17 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md
@@ -13,28 +13,12 @@ git clone https://github.com/opea-project/GenAIComps.git
 cd GenAIComps
 ```
 
-#### Audio to text Service
+#### Whisper Service
 
-The Audio to text Service is another service for converting audio to text. Follow these steps to build and run the service:
+The Whisper Service converts audio files to text. Follow these steps to build and run the service:
 
 ```bash
-docker build -t opea/dataprep-audio2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/audio2text/Dockerfile .
-```
-
-#### Video to Audio Service
-
-The Video to Audio Service extracts audio from video files. Follow these steps to build and run the service:
-
-```bash
-docker build -t opea/dataprep-video2audio:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/video2audio/Dockerfile .
-```
-
-#### Multimedia to Text Service
-
-The Multimedia to Text Service transforms multimedia data to text data. Follow these steps to build and run the service:
-
-```bash
-docker build -t opea/dataprep-multimedia2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/Dockerfile .
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile .
 ```
 
 ### 2. Build MegaService Docker Image
@@ -133,9 +117,6 @@ You will have the following Docker Images:
 2. `opea/docsum:latest`
 3. `opea/llm-docsum-tgi:latest`
 4. `opea/whisper:latest`
-5. `opea/dataprep-audio2text:latest`
-6. `opea/dataprep-multimedia2text:latest`
-7. `opea/dataprep-video2audio:latest`
 
 ### Validate Microservices
 
@@ -172,37 +153,7 @@ You will have the following Docker Images:
      {"asr_result":"you"}
    ```
 
-4. Audio2Text Microservice
-
-   ```bash
-    curl http://${host_ip}:9199/v1/audio/transcriptions \
-        -X POST \
-        -d '{"byte_str":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-        -H 'Content-Type: application/json'
-   ```
-
-   Expected output:
-
-   ```bash
-     {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
-   ```
-
-5. Multimedia to text Microservice
-
-   ```bash
-    curl http://${host_ip}:7079/v1/multimedia2text \
-        -X POST \
-        -d '{"audio":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-        -H 'Content-Type: application/json'
-   ```
-
-   Expected output:
-
-   ```bash
-     {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
-   ```
-
-6. MegaService
+4. MegaService
 
    Text:
 
@@ -274,7 +225,7 @@ You will have the following Docker Images:
       -F "stream=True"
    ```
 
-7. MegaService with long context
+5. MegaService with long context
 
    If you want to deal with long context, can set following parameters and select suitable summary type.
 
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml
index 78d2dba5a..e9ab3e163 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -62,43 +62,12 @@ services:
       - SYS_NICE
     restart: unless-stopped
 
-  dataprep-audio2text:
-    image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
-    container_name: dataprep-audio2text-server
-    ports:
-      - "9199:9099"
-    ipc: host
-    environment:
-      A2T_ENDPOINT: ${A2T_ENDPOINT}
-
-  dataprep-video2audio:
-    image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
-    container_name: dataprep-video2audio-server
-    ports:
-      - "7078:7078"
-    ipc: host
-    environment:
-      V2A_ENDPOINT: ${V2A_ENDPOINT}
-
-  dataprep-multimedia2text:
-    image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
-    container_name: dataprep-multimedia2text
-    ports:
-      - "7079:7079"
-    ipc: host
-    environment:
-      V2A_ENDPOINT: ${V2A_ENDPOINT}
-      A2T_ENDPOINT: ${A2T_ENDPOINT}
-
   docsum-gaudi-backend-server:
     image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
     container_name: docsum-gaudi-backend-server
     depends_on:
       - tgi-server
       - llm-docsum-tgi
-      - dataprep-multimedia2text
-      - dataprep-video2audio
-      - dataprep-audio2text
     ports:
       - "8888:8888"
     environment:
@@ -106,8 +75,8 @@ services:
       - https_proxy=${https_proxy}
       - http_proxy=${http_proxy}
       - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
-      - DATA_SERVICE_HOST_IP=${DATA_SERVICE_HOST_IP}
       - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
 
     ipc: host
     restart: always
diff --git a/DocSum/docker_compose/set_env.sh b/DocSum/docker_compose/set_env.sh
index f48a48243..ffe52a04f 100644
--- a/DocSum/docker_compose/set_env.sh
+++ b/DocSum/docker_compose/set_env.sh
@@ -13,15 +13,7 @@ export no_proxy="${no_proxy},${host_ip}"
 export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export LLM_SERVICE_HOST_IP=${host_ip}
+export ASR_SERVICE_HOST_IP=${host_ip}
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
-
-export V2A_SERVICE_HOST_IP=${host_ip}
-export V2A_ENDPOINT=http://$host_ip:7078
-
-export A2T_ENDPOINT=http://$host_ip:7066
-export A2T_SERVICE_HOST_IP=${host_ip}
-export A2T_SERVICE_PORT=9099
-
-export DATA_ENDPOINT=http://$host_ip:7079
-export DATA_SERVICE_HOST_IP=${host_ip}
-export DATA_SERVICE_PORT=7079
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index da777ebb7..9701c86d1 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -41,33 +41,6 @@ services:
       dockerfile: comps/asr/whisper/dependency/Dockerfile
     extends: docsum
     image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
-  dataprep-multimedia2text:
-    build:
-      args:
-        http_proxy: ${http_proxy}
-        https_proxy: ${https_proxy}
-      context: GenAIComps
-      dockerfile: comps/dataprep/multimedia2text/Dockerfile
-    extends: docsum
-    image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
-  dataprep-audio2text:
-    build:
-      args:
-        http_proxy: ${http_proxy}
-        https_proxy: ${https_proxy}
-      context: GenAIComps
-      dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile
-    extends: docsum
-    image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
-  dataprep-video2audio:
-    build:
-      args:
-        http_proxy: ${http_proxy}
-        https_proxy: ${https_proxy}
-      context: GenAIComps
-      dockerfile: comps/dataprep/multimedia2text/video2audio/Dockerfile
-    extends: docsum
-    image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
   llm-docsum-tgi:
     build:
       context: GenAIComps
diff --git a/DocSum/docsum.py b/DocSum/docsum.py
index 86ecf6979..a640c0f08 100644
--- a/DocSum/docsum.py
+++ b/DocSum/docsum.py
@@ -2,7 +2,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import asyncio
+import base64
 import os
+import subprocess
+import uuid
 from typing import List
 
 from comps import MegaServiceEndpoint, MicroService, ServiceOrchestrator, ServiceRoleType, ServiceType
@@ -20,8 +23,8 @@ from fastapi.responses import StreamingResponse
 
 MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888))
 
-DATA_SERVICE_HOST_IP = os.getenv("DATA_SERVICE_HOST_IP", "0.0.0.0")
-DATA_SERVICE_PORT = int(os.getenv("DATA_SERVICE_PORT", 7079))
+ASR_SERVICE_HOST_IP = os.getenv("ASR_SERVICE_HOST_IP", "0.0.0.0")
+ASR_SERVICE_PORT = int(os.getenv("ASR_SERVICE_PORT", 7066))
 
 LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0")
 LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000))
@@ -29,11 +32,20 @@ LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000))
 
 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
     if self.services[cur_node].service_type == ServiceType.LLM:
+        for key_to_replace in ["text", "asr_result"]:
+            if key_to_replace in inputs:
+                inputs["query"] = inputs[key_to_replace]
+                del inputs[key_to_replace]
+
         docsum_parameters = kwargs.get("docsum_parameters", None)
         if docsum_parameters:
             docsum_parameters = docsum_parameters.model_dump()
             del docsum_parameters["query"]
             inputs.update(docsum_parameters)
+    elif self.services[cur_node].service_type == ServiceType.ASR:
+        if "video" in inputs:
+            audio_base64 = video2audio(inputs["video"])
+            inputs["audio"] = audio_base64
     return inputs
 
 
@@ -45,6 +57,44 @@ def read_pdf(file):
     return docs
 
 
+def video2audio(video_base64: str) -> str:
+    """Convert a base64 video string to a base64 audio string using ffmpeg.
+
+    Args:
+        video_base64 (str): Base64 encoded video string.
+
+    Returns:
+        str: Base64 encoded MP3 audio string.
+
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg fails to extract the audio.
+    """
+    video_data = base64.b64decode(video_base64)
+
+    # Random names keep concurrent requests from clobbering each other's files.
+    uid = str(uuid.uuid4())
+    temp_video_path = f"{uid}.mp4"
+    temp_audio_path = f"{uid}.mp3"
+
+    try:
+        with open(temp_video_path, "wb") as video_file:
+            video_file.write(video_data)
+        subprocess.run(
+            ["ffmpeg", "-i", temp_video_path, "-q:a", "0", "-map", "a", temp_audio_path],
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.STDOUT,
+        )
+        with open(temp_audio_path, "rb") as audio_file:
+            return base64.b64encode(audio_file.read()).decode("utf-8")
+    finally:
+        # ffmpeg may fail before creating the output, so a blind os.remove
+        # would raise FileNotFoundError and hide the real error.
+        for path in (temp_video_path, temp_audio_path):
+            if os.path.exists(path):
+                os.remove(path)
+
+
 def read_text_from_file(file, save_file_name):
     import docx2txt
     from langchain.text_splitter import CharacterTextSplitter
@@ -78,17 +128,18 @@ class DocSumService:
         self.port = port
         ServiceOrchestrator.align_inputs = align_inputs
         self.megaservice = ServiceOrchestrator()
+        self.megaservice_text_only = ServiceOrchestrator()
         self.endpoint = str(MegaServiceEndpoint.DOC_SUMMARY)
 
     def add_remote_service(self):
 
-        data = MicroService(
-            name="multimedia2text",
-            host=DATA_SERVICE_HOST_IP,
-            port=DATA_SERVICE_PORT,
-            endpoint="/v1/multimedia2text",
+        asr = MicroService(
+            name="asr",
+            host=ASR_SERVICE_HOST_IP,
+            port=ASR_SERVICE_PORT,
+            endpoint="/v1/asr",
             use_remote_service=True,
-            service_type=ServiceType.DATAPREP,
+            service_type=ServiceType.ASR,
         )
 
         llm = MicroService(
@@ -100,10 +151,12 @@ class DocSumService:
             service_type=ServiceType.LLM,
         )
 
-        self.megaservice.add(data).add(llm)
-        self.megaservice.flow_to(data, llm)
+        self.megaservice.add(asr).add(llm)
+        self.megaservice.flow_to(asr, llm)
+        self.megaservice_text_only.add(llm)
 
     async def handle_request(self, request: Request, files: List[UploadFile] = File(default=None)):
+        """Accept pure text, .txt/.pdf/.docx files, or a base64 audio/video string."""
 
         if "application/json" in request.headers.get("content-type"):
             data = await request.json()
@@ -129,11 +182,15 @@ class DocSumService:
             file_summaries = []
             if files:
                 for file in files:
-                    file_path = f"/tmp/{file.filename}"
+                    # Fix concurrency issue with the same file name
+                    # https://github.com/opea-project/GenAIExamples/issues/1279
+                    uid = str(uuid.uuid4())
+                    file_path = f"/tmp/{uid}"
 
                     if data_type is not None and data_type in ["audio", "video"]:
                         raise ValueError(
-                            "Audio and Video file uploads are not supported in docsum with curl request, please use the UI."
+                            "Audio and Video file uploads are not supported in docsum with curl request, \
+                                please use the UI or pass base64 string of the content directly."
                         )
 
                     else:
@@ -181,19 +238,34 @@ class DocSumService:
             chunk_overlap=chunk_overlap,
             chunk_size=chunk_size,
         )
+        text_only = "text" in initial_inputs_data
+        if not text_only:
+            result_dict, runtime_graph = await self.megaservice.schedule(
+                initial_inputs=initial_inputs_data, docsum_parameters=docsum_parameters
+            )
 
-        result_dict, runtime_graph = await self.megaservice.schedule(
-            initial_inputs=initial_inputs_data, docsum_parameters=docsum_parameters
-        )
+            for node, response in result_dict.items():
+                # This assumes the last microservice in the megaservice is the LLM.
+                if (
+                    isinstance(response, StreamingResponse)
+                    and node == list(self.megaservice.services.keys())[-1]
+                    and self.megaservice.services[node].service_type == ServiceType.LLM
+                ):
+                    return response
+        else:
+            result_dict, runtime_graph = await self.megaservice_text_only.schedule(
+                initial_inputs=initial_inputs_data, docsum_parameters=docsum_parameters
+            )
+
+            for node, response in result_dict.items():
+                # This assumes the last microservice in the megaservice is the LLM.
+                if (
+                    isinstance(response, StreamingResponse)
+                    and node == list(self.megaservice.services.keys())[-1]
+                    and self.megaservice.services[node].service_type == ServiceType.LLM
+                ):
+                    return response
 
-        for node, response in result_dict.items():
-            # Here it suppose the last microservice in the megaservice is LLM.
-            if (
-                isinstance(response, StreamingResponse)
-                and node == list(self.megaservice.services.keys())[-1]
-                and self.megaservice.services[node].service_type == ServiceType.LLM
-            ):
-                return response
         last_node = runtime_graph.all_leaves()[-1]
         response = result_dict[last_node]["text"]
         choices = []
diff --git a/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml b/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml
index f51c01f2a..fe708a77e 100644
--- a/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml
+++ b/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml
@@ -135,67 +135,6 @@ data:
   HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
   HF_HOME: "/tmp/.cache/huggingface"
 ---
-# Source: docsum/charts/tgi/templates/configmap.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: docsum-audio2text-config
-  labels:
-    helm.sh/chart: audio2text-1.0.0
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-data:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  A2T_ENDPOINT: "http://docsum-whisper"
----
-# Source: docsum/charts/tgi/templates/configmap.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: docsum-video2audio-config
-  labels:
-    helm.sh/chart: video2audio-1.0.0
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-data:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  V2A_ENDPOINT: "http://docsum-video2audio"
----
-# Source: docsum/charts/tgi/templates/configmap.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: docsum-multimedia2text-config
-  labels:
-    helm.sh/chart: multimedia2text-1.0.0
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-data:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  V2A_ENDPOINT: "http://docsum-video2audio"
-  A2T_ENDPOINT: "http://docsum-whisper"
----
 # Source: docsum/charts/tgi/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
@@ -221,81 +160,6 @@ spec:
     app.kubernetes.io/name: whisper
     app.kubernetes.io/instance: docsum
 ---
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-audio2text
-  labels:
-    helm.sh/chart: audio2text-1.0.0
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 9099
-      targetPort: 9199
-      protocol: TCP
-      name: audio2text
-  selector:
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
----
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-video2audio
-  labels:
-    helm.sh/chart: video2audio-1.0.0
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 7078
-      targetPort: 7078
-      protocol: TCP
-      name: video2audio
-  selector:
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
----
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-multimedia2text
-  labels:
-    helm.sh/chart: multimedia2text-1.0.0
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 7079
-      targetPort: 7079
-      protocol: TCP
-      name: multimedia2text
-  selector:
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
----
 # Source: docsum/charts/docsum-ui/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
@@ -801,214 +665,3 @@ spec:
       volumes:
         - name: tmp
           emptyDir: {}
----
-# Source: docsum/charts/audio2text/templates/deployment.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: docsum-audio2text
-  labels:
-    helm.sh/chart: audio2text-1.0.0
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "v1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: audio2text
-      app.kubernetes.io/instance: docsum
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: audio2text
-        app.kubernetes.io/instance: docsum
-    spec:
-      securityContext:
-        {}
-      containers:
-        - name: docsum
-          envFrom:
-            - configMapRef:
-                name: docsum-audio2text-config
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-              - ALL
-            readOnlyRootFilesystem: false
-            runAsNonRoot: true
-            runAsUser: 1000
-            seccompProfile:
-              type: RuntimeDefault
-          image: "opea/dataprep-audio2text:latest"
-          imagePullPolicy: IfNotPresent
-          ports:
-            - name: audio2text
-              containerPort: 9199
-              protocol: TCP
-          volumeMounts:
-            - mountPath: /tmp
-              name: tmp
-          resources:
-            {}
-      volumes:
-        - name: tmp
-          emptyDir: {}
----
-# Source: docsum/charts/video2audio/templates/deployment.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: docsum-video2audio
-  labels:
-    helm.sh/chart: video2audio-1.0.0
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "v1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: video2audio
-      app.kubernetes.io/instance: docsum
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: video2audio
-        app.kubernetes.io/instance: docsum
-    spec:
-      securityContext:
-        {}
-      containers:
-        - name: docsum
-          envFrom:
-            - configMapRef:
-                name: docsum-video2audio-config
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-              - ALL
-            readOnlyRootFilesystem: false
-            runAsNonRoot: true
-            runAsUser: 1000
-            seccompProfile:
-              type: RuntimeDefault
-          image: "opea/dataprep-video2audio:latest"
-          imagePullPolicy: IfNotPresent
-          ports:
-            - name: video2audio
-              containerPort: 7078
-              protocol: TCP
-          volumeMounts:
-            - mountPath: /tmp
-              name: tmp
-          livenessProbe:
-            failureThreshold: 24
-            httpGet:
-              path: v1/health_check
-              port: video2audio
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          readinessProbe:
-            httpGet:
-              path: v1/health_check
-              port: video2audio
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          startupProbe:
-            failureThreshold: 120
-            httpGet:
-              path: v1/health_check
-              port: video2audio
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          resources:
-            {}
-      volumes:
-        - name: tmp
-          emptyDir: {}
----
-# Source: docsum/charts/multimedia2text/templates/deployment.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: docsum-multimedia2text
-  labels:
-    helm.sh/chart: multimedia2text-1.0.0
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "v1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: multimedia2text
-      app.kubernetes.io/instance: docsum
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: multimedia2text
-        app.kubernetes.io/instance: docsum
-    spec:
-      securityContext:
-        {}
-      containers:
-        - name: docsum
-          envFrom:
-            - configMapRef:
-                name: docsum-multimedia2text-config
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-              - ALL
-            readOnlyRootFilesystem: false
-            runAsNonRoot: true
-            runAsUser: 1000
-            seccompProfile:
-              type: RuntimeDefault
-          image: "opea/dataprep-multimedia2text:latest"
-          imagePullPolicy: IfNotPresent
-          ports:
-            - name: multimedia2text
-              containerPort: 7079
-              protocol: TCP
-          volumeMounts:
-            - mountPath: /tmp
-              name: tmp
-          livenessProbe:
-            failureThreshold: 24
-            httpGet:
-              path: v1/health_check
-              port: multimedia2text
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          readinessProbe:
-            httpGet:
-              path: v1/health_check
-              port: multimedia2text
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          startupProbe:
-            failureThreshold: 120
-            httpGet:
-              path: v1/health_check
-              port: multimedia2text
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          resources:
-            {}
-      volumes:
-        - name: tmp
-          emptyDir: {}
diff --git a/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml b/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml
index 9eae01e68..c3d1128ab 100644
--- a/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml
+++ b/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml
@@ -136,66 +136,30 @@ data:
   HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
   HF_HOME: "/tmp/.cache/huggingface"
 ---
-# Source: docsum/charts/tgi/templates/configmap.yaml
+# Source: docsum/charts/tgi/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 apiVersion: v1
-kind: ConfigMap
+kind: Service
 metadata:
-  name: docsum-audio2text-config
+  name: docsum-whisper
   labels:
-    helm.sh/chart: audio2text-1.0.0
-    app.kubernetes.io/name: audio2text
+    helm.sh/chart: whisper-1.0.0
+    app.kubernetes.io/name: whisper
     app.kubernetes.io/instance: docsum
     app.kubernetes.io/version: "2.1.0"
     app.kubernetes.io/managed-by: Helm
-data:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  A2T_ENDPOINT: "http://docsum-whisper"
----
-# Source: docsum/charts/tgi/templates/configmap.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: docsum-video2audio-config
-  labels:
-    helm.sh/chart: video2audio-1.0.0
-    app.kubernetes.io/name: video2audio
+spec:
+  type: ClusterIP
+  ports:
+    - port: 7066
+      targetPort: 7066
+      protocol: TCP
+      name: whisper
+  selector:
+    app.kubernetes.io/name: whisper
     app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-data:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  V2A_ENDPOINT: "http://docsum-video2audio"
----
-# Source: docsum/charts/tgi/templates/configmap.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: docsum-multimedia2text-config
-  labels:
-    helm.sh/chart: multimedia2text-1.0.0
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-data:
-  http_proxy: ""
-  https_proxy: ""
-  no_proxy: ""
-  V2A_ENDPOINT: "http://docsum-video2audio"
-  A2T_ENDPOINT: "http://docsum-whisper"
 ---
 # Source: docsum/charts/docsum-ui/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
@@ -288,106 +252,6 @@ spec:
     app: docsum-nginx
   type: NodePort
 ---
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-whisper
-  labels:
-    helm.sh/chart: whisper-1.0.0
-    app.kubernetes.io/name: whisper
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 7066
-      targetPort: 7066
-      protocol: TCP
-      name: whisper
-  selector:
-    app.kubernetes.io/name: whisper
-    app.kubernetes.io/instance: docsum
----
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-audio2text
-  labels:
-    helm.sh/chart: audio2text-1.0.0
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 9099
-      targetPort: 9199
-      protocol: TCP
-      name: audio2text
-  selector:
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
----
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-video2audio
-  labels:
-    helm.sh/chart: video2audio-1.0.0
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 7078
-      targetPort: 7078
-      protocol: TCP
-      name: video2audio
-  selector:
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
----
-# Source: docsum/charts/tgi/templates/service.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: v1
-kind: Service
-metadata:
-  name: docsum-multimedia2text
-  labels:
-    helm.sh/chart: multimedia2text-1.0.0
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "2.1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  type: ClusterIP
-  ports:
-    - port: 7079
-      targetPort: 7079
-      protocol: TCP
-      name: multimedia2text
-  selector:
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
----
 # Source: docsum/templates/service.yaml
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
@@ -805,214 +669,3 @@ spec:
       volumes:
         - name: tmp
           emptyDir: {}
----
-# Source: docsum/charts/audio2text/templates/deployment.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: docsum-audio2text
-  labels:
-    helm.sh/chart: audio2text-1.0.0
-    app.kubernetes.io/name: audio2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "v1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: audio2text
-      app.kubernetes.io/instance: docsum
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: audio2text
-        app.kubernetes.io/instance: docsum
-    spec:
-      securityContext:
-        {}
-      containers:
-        - name: docsum
-          envFrom:
-            - configMapRef:
-                name: docsum-audio2text-config
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-              - ALL
-            readOnlyRootFilesystem: false
-            runAsNonRoot: true
-            runAsUser: 1000
-            seccompProfile:
-              type: RuntimeDefault
-          image: "opea/dataprep-audio2text:latest"
-          imagePullPolicy: IfNotPresent
-          ports:
-            - name: audio2text
-              containerPort: 9199
-              protocol: TCP
-          volumeMounts:
-            - mountPath: /tmp
-              name: tmp
-          resources:
-            {}
-      volumes:
-        - name: tmp
-          emptyDir: {}
----
-# Source: docsum/charts/video2audio/templates/deployment.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: docsum-video2audio
-  labels:
-    helm.sh/chart: video2audio-1.0.0
-    app.kubernetes.io/name: video2audio
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "v1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: video2audio
-      app.kubernetes.io/instance: docsum
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: video2audio
-        app.kubernetes.io/instance: docsum
-    spec:
-      securityContext:
-        {}
-      containers:
-        - name: docsum
-          envFrom:
-            - configMapRef:
-                name: docsum-video2audio-config
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-              - ALL
-            readOnlyRootFilesystem: false
-            runAsNonRoot: true
-            runAsUser: 1000
-            seccompProfile:
-              type: RuntimeDefault
-          image: "opea/dataprep-video2audio:latest"
-          imagePullPolicy: IfNotPresent
-          ports:
-            - name: video2audio
-              containerPort: 7078
-              protocol: TCP
-          volumeMounts:
-            - mountPath: /tmp
-              name: tmp
-          livenessProbe:
-            failureThreshold: 24
-            httpGet:
-              path: v1/health_check
-              port: video2audio
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          readinessProbe:
-            httpGet:
-              path: v1/health_check
-              port: video2audio
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          startupProbe:
-            failureThreshold: 120
-            httpGet:
-              path: v1/health_check
-              port: video2audio
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          resources:
-            {}
-      volumes:
-        - name: tmp
-          emptyDir: {}
----
-# Source: docsum/charts/multimedia2text/templates/deployment.yaml
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: docsum-multimedia2text
-  labels:
-    helm.sh/chart: multimedia2text-1.0.0
-    app.kubernetes.io/name: multimedia2text
-    app.kubernetes.io/instance: docsum
-    app.kubernetes.io/version: "v1.0"
-    app.kubernetes.io/managed-by: Helm
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: multimedia2text
-      app.kubernetes.io/instance: docsum
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: multimedia2text
-        app.kubernetes.io/instance: docsum
-    spec:
-      securityContext:
-        {}
-      containers:
-        - name: docsum
-          envFrom:
-            - configMapRef:
-                name: docsum-multimedia2text-config
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-              - ALL
-            readOnlyRootFilesystem: false
-            runAsNonRoot: true
-            runAsUser: 1000
-            seccompProfile:
-              type: RuntimeDefault
-          image: "opea/dataprep-multimedia2text:latest"
-          imagePullPolicy: IfNotPresent
-          ports:
-            - name: multimedia2text
-              containerPort: 7079
-              protocol: TCP
-          volumeMounts:
-            - mountPath: /tmp
-              name: tmp
-          livenessProbe:
-            failureThreshold: 24
-            httpGet:
-              path: v1/health_check
-              port: multimedia2text
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          readinessProbe:
-            httpGet:
-              path: v1/health_check
-              port: multimedia2text
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          startupProbe:
-            failureThreshold: 120
-            httpGet:
-              path: v1/health_check
-              port: multimedia2text
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          resources:
-            {}
-      volumes:
-        - name: tmp
-          emptyDir: {}
diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh
index e554f7268..6287ade8c 100644
--- a/DocSum/tests/test_compose_on_gaudi.sh
+++ b/DocSum/tests/test_compose_on_gaudi.sh
@@ -21,20 +21,10 @@ export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export LLM_SERVICE_HOST_IP=${host_ip}
+export ASR_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
 export no_proxy="${no_proxy},${host_ip}"
 
-export V2A_SERVICE_HOST_IP=${host_ip}
-export V2A_ENDPOINT=http://$host_ip:7078
-
-export A2T_ENDPOINT=http://$host_ip:7066
-export A2T_SERVICE_HOST_IP=${host_ip}
-export A2T_SERVICE_PORT=9199
-
-export DATA_ENDPOINT=http://$host_ip:7079
-export DATA_SERVICE_HOST_IP=${host_ip}
-export DATA_SERVICE_PORT=7079
-
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 
@@ -47,7 +37,7 @@ function build_docker_images() {
     git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
 
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="docsum docsum-gradio-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi"
+    service_list="docsum docsum-gradio-ui whisper llm-docsum-tgi"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
     docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
@@ -187,46 +177,6 @@ function validate_microservices() {
         "whisper-server" \
         "{\"audio\": \"$(input_data_for_test "audio")\"}"
 
-    # Audio2Text service
-    validate_services_json \
-        "${host_ip}:9199/v1/audio/transcriptions" \
-        '"query":"well"' \
-        "dataprep-audio2text" \
-        "dataprep-audio2text-server" \
-        "{\"byte_str\": \"$(input_data_for_test "audio")\"}"
-
-    # Video2Audio service
-    validate_services_json \
-        "${host_ip}:7078/v1/video2audio" \
-        "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAAAJAYwAAAAAAAADd95t4qPAAAAAAAAAAAAAAAAAAAAAP/7kGQAAAMhClSVMEACMOAabaCMAREA" \
-        "dataprep-video2audio" \
-        "dataprep-video2audio-server" \
-        "{\"byte_str\": \"$(input_data_for_test "video")\"}"
-
-    # Docsum Data service - video
-    validate_services_json \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "well" \
-        "dataprep-multimedia2text" \
-        "dataprep-multimedia2text" \
-        "{\"video\": \"$(input_data_for_test "video")\"}"
-
-    # Docsum Data service - audio
-    validate_services_json \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "well" \
-        "dataprep-multimedia2text" \
-        "dataprep-multimedia2text" \
-        "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
-    # Docsum Data service - text
-    validate_services_json \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \
-        "dataprep-multimedia2text" \
-        "dataprep-multimedia2text" \
-        "{\"text\": \"$(input_data_for_test "text")\"}"
-
 }
 
 function validate_megaservice_text() {
diff --git a/DocSum/tests/test_compose_on_rocm.sh b/DocSum/tests/test_compose_on_rocm.sh
index 0045f1064..5f3083d8f 100644
--- a/DocSum/tests/test_compose_on_rocm.sh
+++ b/DocSum/tests/test_compose_on_rocm.sh
@@ -29,24 +29,17 @@ export DOCSUM_BACKEND_SERVER_PORT="8888"
 export DOCSUM_FRONTEND_PORT="5552"
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export LLM_SERVICE_HOST_IP=${host_ip}
+export ASR_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/docsum"
 export DOCSUM_CARD_ID="card1"
 export DOCSUM_RENDER_ID="renderD136"
-export V2A_SERVICE_HOST_IP=${host_ip}
-export V2A_ENDPOINT=http://${host_ip}:7078
-export A2T_ENDPOINT=http://${host_ip}:7066
-export A2T_SERVICE_HOST_IP=${host_ip}
-export A2T_SERVICE_PORT=9099
-export DATA_ENDPOINT=http://${host_ip}:7079
-export DATA_SERVICE_HOST_IP=${host_ip}
-export DATA_SERVICE_PORT=7079
 
 function build_docker_images() {
     cd $WORKPATH/docker_image_build
     git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
 
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="docsum docsum-gradio-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi"
+    service_list="docsum docsum-gradio-ui whisper llm-docsum-tgi"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
     docker pull ghcr.io/huggingface/text-generation-inference:1.4
@@ -141,46 +134,6 @@ function validate_microservices() {
         "whisper-service" \
         "{\"audio\": \"$(input_data_for_test "audio")\"}"
 
-    # Audio2Text service
-    validate_services \
-        "${host_ip}:9099/v1/audio/transcriptions" \
-        '"query":"well"' \
-        "dataprep-audio2text" \
-        "dataprep-audio2text-service" \
-        "{\"byte_str\": \"$(input_data_for_test "audio")\"}"
-
-    # Video2Audio service
-    validate_services \
-        "${host_ip}:7078/v1/video2audio" \
-        "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAAAJAYwAAAAAAAADd95t4qPAAAAAAAAAAAAAAAAAAAAAP/7kGQAAAMhClSVMEACMOAabaCMAREA" \
-        "dataprep-video2audio" \
-        "dataprep-video2audio-service" \
-        "{\"byte_str\": \"$(input_data_for_test "video")\"}"
-
-    # Docsum Data service - video
-    validate_services \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "well" \
-        "dataprep-multimedia2text-service" \
-        "dataprep-multimedia2text" \
-        "{\"video\": \"$(input_data_for_test "video")\"}"
-
-    # Docsum Data service - audio
-    validate_services \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "well" \
-        "dataprep-multimedia2text-service" \
-        "dataprep-multimedia2text" \
-        "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
-    # Docsum Data service - text
-    validate_services \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \
-        "dataprep-multimedia2text-service" \
-        "dataprep-multimedia2text" \
-        "{\"text\": \"$(input_data_for_test "text")\"}"
-
     # tgi for llm service
     validate_services \
         "${host_ip}:8008/generate" \
diff --git a/DocSum/tests/test_compose_on_xeon.sh b/DocSum/tests/test_compose_on_xeon.sh
index da664a775..91d5ece1b 100644
--- a/DocSum/tests/test_compose_on_xeon.sh
+++ b/DocSum/tests/test_compose_on_xeon.sh
@@ -21,20 +21,10 @@ export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export LLM_SERVICE_HOST_IP=${host_ip}
+export ASR_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
 export no_proxy="${no_proxy},${host_ip}"
 
-export V2A_SERVICE_HOST_IP=${host_ip}
-export V2A_ENDPOINT=http://$host_ip:7078
-
-export A2T_ENDPOINT=http://$host_ip:7066
-export A2T_SERVICE_HOST_IP=${host_ip}
-export A2T_SERVICE_PORT=9099
-
-export DATA_ENDPOINT=http://$host_ip:7079
-export DATA_SERVICE_HOST_IP=${host_ip}
-export DATA_SERVICE_PORT=7079
-
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 
@@ -46,7 +36,7 @@ function build_docker_images() {
     git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
 
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="docsum docsum-gradio-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi"
+    service_list="docsum docsum-gradio-ui whisper llm-docsum-tgi"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
     docker pull ghcr.io/huggingface/text-generation-inference:1.4
@@ -189,46 +179,6 @@ function validate_microservices() {
         "whisper-server" \
         "{\"audio\": \"$(input_data_for_test "audio")\"}"
 
-    # Audio2Text service
-    validate_services_json \
-        "${host_ip}:9099/v1/audio/transcriptions" \
-        '"query":"well"' \
-        "dataprep-audio2text" \
-        "dataprep-audio2text-server" \
-        "{\"byte_str\": \"$(input_data_for_test "audio")\"}"
-
-    # Video2Audio service
-    validate_services_json \
-        "${host_ip}:7078/v1/video2audio" \
-        "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAAAJAYwAAAAAAAADd95t4qPAAAAAAAAAAAAAAAAAAAAAP/7kGQAAAMhClSVMEACMOAabaCMAREA" \
-        "dataprep-video2audio" \
-        "dataprep-video2audio-server" \
-        "{\"byte_str\": \"$(input_data_for_test "video")\"}"
-
-    # Docsum Data service - video
-    validate_services_json \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "well" \
-        "dataprep-multimedia2text" \
-        "dataprep-multimedia2text" \
-        "{\"video\": \"$(input_data_for_test "video")\"}"
-
-    # Docsum Data service - audio
-    validate_services_json \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "well" \
-        "dataprep-multimedia2text" \
-        "dataprep-multimedia2text" \
-        "{\"audio\": \"$(input_data_for_test "audio")\"}"
-
-    # Docsum Data service - text
-    validate_services_json \
-        "${host_ip}:7079/v1/multimedia2text" \
-        "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \
-        "dataprep-multimedia2text" \
-        "dataprep-multimedia2text" \
-        "{\"text\": \"$(input_data_for_test "text")\"}"
-
 }
 
 function validate_megaservice_text() {
diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
index 01e1c3121..fa2b78cea 100644
--- a/DocSum/ui/gradio/docsum_ui_gradio.py
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -77,7 +77,7 @@ class DocSumUI:
         """
         logger.info(">>> Reading audio file: %s", file.name)
         base64_str = self.encode_file_to_base64(file)
-        return self.generate_summary(base64_str, document_type="audio")
+        return base64_str
 
     def read_video_file(self, file):
         """Read and process the content of a video file.
@@ -90,7 +90,7 @@ class DocSumUI:
         """
         logger.info(">>> Reading video file: %s", file.name)
         base64_str = self.encode_file_to_base64(file)
-        return self.generate_summary(base64_str, document_type="video")
+        return base64_str
 
     def is_valid_url(self, url):
         try:
@@ -193,7 +193,7 @@ class DocSumUI:
 
         return str(response.status_code)
 
-    def create_upload_ui(self, label, file_types, process_function):
+    def create_upload_ui(self, label, file_types, process_function, document_type="text"):
         """Create a Gradio UI for file uploads.
 
         Args:
@@ -213,7 +213,11 @@ class DocSumUI:
                     generated_text = gr.TextArea(
                         label="Text Summary", placeholder="Summarized text will be displayed here"
                     )
-            upload_btn.upload(lambda file: self.generate_summary(process_function(file)), upload_btn, generated_text)
+            upload_btn.upload(
+                lambda file: self.generate_summary(process_function(file), document_type=document_type),
+                upload_btn,
+                generated_text,
+            )
         return upload_ui
 
     def render(self):
@@ -269,11 +273,15 @@ class DocSumUI:
             label="Please upload audio file (.wav, .mp3)",
             file_types=[".wav", ".mp3"],
             process_function=self.read_audio_file,
+            document_type="audio",
         )
 
         # Video Upload UI
         video_ui = self.create_upload_ui(
-            label="Please upload Video file (.mp4)", file_types=[".mp4"], process_function=self.read_video_file
+            label="Please upload Video file (.mp4)",
+            file_types=[".mp4"],
+            process_function=self.read_video_file,
+            document_type="video",
         )
 
         # Render all the UI in separate tabs