[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
update translation UI response format
2025-03-21 08:03:31 +00:00 · 2025-03-21 15:57:43 +08:00
94 changed files with 624 additions and 4892 deletions
--- a/.github/workflows/_build_image.yml
+++ b/.github/workflows/_build_image.yml
@@ -1,103 +0,0 @@
-# Copyright (C) 2025 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-name: Build Images
-permissions: read-all
-on:
-  workflow_call:
-    inputs:
-      node:
-        required: true
-        type: string
-      build:
-        default: true
-        required: false
-        type: boolean
-      example:
-        required: true
-        type: string
-      services:
-        default: ""
-        required: false
-        type: string
-      tag:
-        default: "latest"
-        required: false
-        type: string
-      opea_branch:
-        default: "main"
-        required: false
-        type: string
-      inject_commit:
-        default: false
-        required: false
-        type: boolean
-
-jobs:
-  pre-build-image-check:
-    runs-on: ubuntu-latest
-    outputs:
-      should_skip: ${{ steps.check-skip.outputs.should_skip }}
-    steps:
-      - name: Check if job should be skipped
-        id: check-skip
-        run: |
-          should_skip=false
-          if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
-            should_skip=true
-          fi
-          echo "should_skip=$should_skip"
-          echo "should_skip=$should_skip" >> $GITHUB_OUTPUT
-
-  build-images:
-    needs: [ pre-build-image-check ]
-    if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' && fromJSON(inputs.build) }}
-    runs-on: "docker-build-${{ inputs.node }}"
-    steps:
-      - name: Clean Up Working Directory
-        run: sudo rm -rf ${{github.workspace}}/*
-
-      - name: Get Checkout Ref
-        run: |
-          if [ "${{ github.event_name }}" == "pull_request" ] || [ "${{ github.event_name }}" == "pull_request_target" ]; then
-            echo "CHECKOUT_REF=refs/pull/${{ github.event.number }}/merge" >> $GITHUB_ENV
-          else
-            echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
-          fi
-
-      - name: Checkout out GenAIExamples
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ env.CHECKOUT_REF }}
-          fetch-depth: 0
-
-      - name: Clone Required Repo
-        run: |
-          cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
-          docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
-          if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
-              git clone https://github.com/vllm-project/vllm.git && cd vllm
-              # Get the latest tag
-              VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
-              echo "Check out vLLM tag ${VLLM_VER}"
-              git checkout ${VLLM_VER} &> /dev/null && cd ../
-          fi
-          if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
-              git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
-              # Get the latest tag
-              VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
-              echo "Check out vLLM tag ${VLLM_VER}"
-              git checkout ${VLLM_VER} &> /dev/null && cd ../
-          fi
-          git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
-          cd GenAIComps && git rev-parse HEAD && cd ../
-
-      - name: Build Image
-        uses: opea-project/validation/actions/image-build@main
-        with:
-          work_dir: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
-          docker_compose_path: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
-          service_list: ${{ inputs.services }}
-          registry: ${OPEA_IMAGE_REPO}opea
-          inject_commit: ${{ inputs.inject_commit }}
-          tag: ${{ inputs.tag }}
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -53,23 +53,101 @@ jobs:
 ####################################################################################################
 # Image Build
 ####################################################################################################
+  pre-build-image-check:  # Gate job: publishes should_skip so the image build can be bypassed for selected nodes.
+    runs-on: ubuntu-latest
+    outputs:
+      should_skip: ${{ steps.check-skip.outputs.should_skip }}  # string 'true' or 'false' written by the check-skip step
+    steps:
+      - name: Check if job should be skipped  # should_skip is 'true' only when inputs.node is gaudi3, rocm or arc
+        id: check-skip  # referenced by the job-level outputs mapping
+        run: |
+          if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
+            echo "should_skip=true" >> $GITHUB_OUTPUT
+          else
+            echo "should_skip=false" >> $GITHUB_OUTPUT
+          fi
+
  build-images:
-    uses: ./.github/workflows/_build_image.yml
-    with:
-      node: ${{ inputs.node }}
-      build: ${{ fromJSON(inputs.build) }}
-      example: ${{ inputs.example }}
-      services: ${{ inputs.services }}
-      tag: ${{ inputs.tag }}
-      opea_branch: ${{ inputs.opea_branch }}
-      inject_commit: ${{ inputs.inject_commit }}
+    needs: [pre-build-image-check]  # supplies the should_skip output tested in the job-level condition
+    if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' }}  # whole job is skipped for gaudi3/rocm/arc nodes
+    runs-on: "docker-build-${{ inputs.node }}"  # runner label is derived from the target node name
+    steps:
+      - name: Clean Up Working Directory  # wipes the workspace contents before the checkout
+        run: sudo rm -rf ${{github.workspace}}/*
+
+      - name: Get Checkout Ref  # PR events use the PR merge ref; every other event uses github.ref
+        run: |
+          if [ "${{ github.event_name }}" == "pull_request" ] || [ "${{ github.event_name }}" == "pull_request_target" ]; then
+            echo "CHECKOUT_REF=refs/pull/${{ github.event.number }}/merge" >> $GITHUB_ENV
+          else
+            echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
+          fi
+
+      - name: Checkout out GenAIExamples
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.CHECKOUT_REF }}  # exported to GITHUB_ENV by the previous step
+          fetch-depth: 0  # fetch full history rather than a shallow clone
+
+      - name: Clone Required Repo  # clones vllm / vllm-fork only when build.yaml mentions them; GenAIComps is always cloned
+        run: |
+          cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
+          docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
+          if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
+              git clone https://github.com/vllm-project/vllm.git && cd vllm
+              # Get the latest tag
+              VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+              echo "Check out vLLM tag ${VLLM_VER}"
+              git checkout ${VLLM_VER} &> /dev/null && cd ../
+          fi
+          if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
+              git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+              # Get the latest tag
+              VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+              echo "Check out vLLM tag ${VLLM_VER}"
+              git checkout ${VLLM_VER} &> /dev/null && cd ../
+          fi
+          git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
+          cd GenAIComps && git rev-parse HEAD && cd ../
+
+      - name: Build Image
+        if: ${{ fromJSON(inputs.build) }}  # only this step honors inputs.build; the clean-up, checkout and clone steps always run
+        uses: opea-project/validation/actions/image-build@main
+        with:
+          work_dir: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
+          docker_compose_path: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
+          service_list: ${{ inputs.services }}
+          registry: ${OPEA_IMAGE_REPO}opea  # NOTE(review): shell-style variable, presumably expanded inside the action - confirm
+          inject_commit: ${{ inputs.inject_commit }}
+          tag: ${{ inputs.tag }}
+
+  pre-compose-test-check:  # Decides whether the compose test runs, from inputs.test_compose and the image-build outcome.
+    needs: [pre-build-image-check, build-images]
+    if: always()  # still evaluated when build-images was skipped or failed
+    runs-on: ubuntu-latest
+    outputs:
+      run_compose: ${{ steps.check-compose.outputs.run_compose }}  # string 'true' or 'false'
+    steps:
+      - name: Check if job should be skipped  # run_compose=true when test_compose is true and the build succeeded or was skipped by the pre-check
+        id: check-compose
+        run: |
+          set -x
+          run_compose="false"
+          if [[ "${{ inputs.test_compose }}" == "true" ]]; then
+            if [[ "${{ needs.pre-build-image-check.outputs.should_skip }}" == "false" && "${{ needs.build-images.result}}" == "success" || "${{ needs.pre-build-image-check.outputs.should_skip }}" == "true" ]]; then
+              run_compose="true"
+            fi
+          fi
+          echo "run_compose=$run_compose"
+          echo "run_compose=$run_compose" >> $GITHUB_OUTPUT
+

 ####################################################################################################
 # Docker Compose Test
 ####################################################################################################
  test-example-compose:
-    needs: [build-images]
-    if: ${{ inputs.test_compose }}
+    needs: [pre-compose-test-check]
+    if: ${{ always() && needs.pre-compose-test-check.outputs.run_compose == 'true' }}
    uses: ./.github/workflows/_run-docker-compose.yml
    with:
      tag: ${{ inputs.tag }}
--- a/.github/workflows/scripts/docker_compose_clean_up.sh
+++ b/.github/workflows/scripts/docker_compose_clean_up.sh
@@ -30,20 +30,13 @@ case "$1" in
        echo "$ports"
        for port in $ports; do
          if [[ $port =~ [a-zA-Z_-] ]]; then
-            echo "Search port value $port from the test case..."
-            port_fix=$(grep -E "export $port=" tests/$test_case | cut -d'=' -f2)
-            if [[ "$port_fix" == "" ]]; then
-              echo "Can't find the port value from the test case, use the default value in yaml..."
-              port_fix=$(yq '.services[].ports[]' $yaml_file | grep $port | cut -d':' -f2 |  grep -o '[0-9a-zA-Z]\+')
-            fi
-            port=$port_fix
+            port=$(grep -E "export $port=" tests/$test_case | cut -d'=' -f2)
          fi
          if [[ $port =~ [0-9] ]]; then
            if [[ $port == 5000 ]]; then
              echo "Error: Port 5000 is used by local docker registry, please DO NOT use it in docker compose deployment!!!"
              exit 1
            fi
-            echo "Check port $port..."
            cid=$(docker ps --filter "publish=${port}" --format "{{.ID}}")
            if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && echo "release $port"; fi
          fi
--- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
@@ -101,7 +101,7 @@ services:
      - "8080:8000"
    ipc: host
  agent-ui:
-    image: opea/agent-ui
+    image: opea/agent-ui  # namespace/repository form; "opea:agent-ui" would mean repository "opea" with tag "agent-ui"
    container_name: agent-ui
    volumes:
      - ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env # test db
--- a/AgentQnA/tests/step1_build_images.sh
+++ b/AgentQnA/tests/step1_build_images.sh
@@ -22,7 +22,7 @@ function build_docker_images_for_retrieval_tool(){
    echo "Build all the images with --no-cache..."
    service_list="doc-index-retriever dataprep embedding retriever reranking"
    docker compose -f build.yaml build ${service_list} --no-cache
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
 }
--- a/AudioQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/README.md
@@ -3,317 +3,104 @@
 This document outlines the deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice
 pipeline on server on AMD ROCm GPU platform.

-## Build Docker Images
+## 🚀 Build Docker images

-### 1. Build Docker Image
-
- #### Create application install directory and go to it:
-
-  ```bash
-  mkdir ~/audioqna-install && cd audioqna-install
-  ```
-
- #### Clone the repository GenAIExamples (the default repository branch "main" is used here):
-
-  ```bash
-  git clone https://github.com/opea-project/GenAIExamples.git
-  ```
-
-  If you need to use a specific branch/tag of the GenAIExamples repository, then (v1.3 replace with its own value):
-
-  ```bash
-  git clone https://github.com/opea-project/GenAIExamples.git && cd GenAIExamples && git checkout v1.3
-  ```
-
-  We remind you that when using a specific version of the code, you need to use the README from this version:
-
- #### Go to build directory:
-
-  ```bash
-  cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_image_build
-  ```
-
- Cleaning up the GenAIComps repository if it was previously cloned in this directory.
-  This is necessary if the build was performed earlier and the GenAIComps folder exists and is not empty:
-
-  ```bash
-  echo Y | rm -R GenAIComps
-  ```
-
- #### Clone the repository GenAIComps (the default repository branch "main" is used here):
+### 1. Source Code install GenAIComps

 ```bash
 git clone https://github.com/opea-project/GenAIComps.git
 cd GenAIComps
 ```

-We remind you that when using a specific version of the code, you need to use the README from this version.
-
- #### Setting the list of images for the build (from the build file.yaml)
-
-  If you want to deploy a vLLM-based or TGI-based application, then the set of services is installed as follows:
-
-  #### vLLM-based application
-
-  ```bash
-  service_list="vllm-rocm whisper speecht5 audioqna audioqna-ui"
-  ```
-
-  #### TGI-based application
-
-  ```bash
-  service_list="whisper speecht5 audioqna audioqna-ui"
-  ```
-
- #### Optional. Pull TGI Docker Image (Do this if you want to use TGI)
-
-  ```bash
-  docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
-  ```
-
- #### Build Docker Images
-
-  ```bash
-  docker compose -f build.yaml build ${service_list} --no-cache
-  ```
-
-  After the build, we check the list of images with the command:
-
-  ```bash
-  docker image ls
-  ```
-
-  The list of images should include:
-
-  ##### vLLM-based application:
-
-  - opea/vllm-rocm:latest
-    - opea/whisper:latest
-    - opea/speecht5:latest
-    - opea/audioqna:latest
-
-  ##### TGI-based application:
-
-  - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
-    - opea/whisper:latest
-    - opea/speecht5:latest
-    - opea/audioqna:latest
-
---
-
-## Deploy the AudioQnA Application
-
-### Docker Compose Configuration for AMD GPUs
-
-To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose file:
-
- compose_vllm.yaml - for vLLM-based application
- compose.yaml - for TGI-based
-
-```yaml
-shm_size: 1g
-devices:
-  - /dev/kfd:/dev/kfd
-  - /dev/dri/:/dev/dri/
-cap_add:
-  - SYS_PTRACE
-group_add:
-  - video
-security_opt:
-  - seccomp:unconfined
-```
-
-This configuration forwards all available GPUs to the container. To use a specific GPU, specify its `cardN` and `renderN` device IDs. For example:
-
-```yaml
-shm_size: 1g
-devices:
-  - /dev/kfd:/dev/kfd
-  - /dev/dri/card0:/dev/dri/card0
-  - /dev/dri/render128:/dev/dri/render128
-cap_add:
-  - SYS_PTRACE
-group_add:
-  - video
-security_opt:
-  - seccomp:unconfined
-```
-
-**How to Identify GPU Device IDs:**
-Use AMD GPU driver utilities to determine the correct `cardN` and `renderN` IDs for your GPU.
-
-### Set deploy environment variables
-
-#### Setting variables in the operating system environment:
-
-##### Set variable HUGGINGFACEHUB_API_TOKEN:
+### 2. Build ASR Image

 ```bash
-### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
-export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
 ```

-#### Set variables value in set_env\*\*\*\*.sh file:
+### 3. Build LLM Image

-Go to Docker Compose directory:
+For the ROCm compose example, the TGI service uses the AMD-optimized image published by Hugging Face: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm (https://github.com/huggingface/text-generation-inference)
+
+### 4. Build TTS Image

 ```bash
-cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
+docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
 ```

-The example uses the Nano text editor. You can use any convenient text editor:
+### 5. Build MegaService Docker Image

-#### If you use vLLM
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:

 ```bash
-nano set_env_vllm.sh
+git clone https://github.com/opea-project/GenAIExamples.git
+cd GenAIExamples/AudioQnA/
+docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
 ```

-#### If you use TGI
+Then run the command `docker images`; the following images should be ready:
+
+1. `opea/whisper:latest`
+2. `opea/speecht5:latest`
+3. `opea/audioqna:latest`
+
+## 🚀 Set the environment variables
+
+Before starting the services with `docker compose`, you have to recheck the following environment variables.

 ```bash
-nano set_env.sh
+export host_ip=<your External Public IP>    # export host_ip=$(hostname -I | awk '{print $1}')
+export HUGGINGFACEHUB_API_TOKEN=<your HF token>
+
+export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
+
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}
+
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 ```

-If you are in a proxy environment, also set the proxy-related environment variables:
+Alternatively, use the `set_env.sh` file to set up the environment variables.
+
+Note: Please replace `host_ip` with your external IP address; do not use localhost.
+
+Note: To limit access to a subset of GPUs, pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the card index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+Example of isolating 1 GPU:
+
+      - /dev/dri/card0:/dev/dri/card0
+      - /dev/dri/renderD128:/dev/dri/renderD128
+
+Example of isolating 2 GPUs:
+
+      - /dev/dri/card0:/dev/dri/card0
+      - /dev/dri/renderD128:/dev/dri/renderD128
+      - /dev/dri/card1:/dev/dri/card1
+      - /dev/dri/renderD129:/dev/dri/renderD129
+
+Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
+
+## 🚀 Start the MegaService

 ```bash
-export http_proxy="Your_HTTP_Proxy"
-export https_proxy="Your_HTTPs_Proxy"
+cd GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm/
+docker compose up -d
 ```

-Set the values of the variables:
+In the following cases, you can build the Docker images from source yourself:

- **HOST_IP, HOST_IP_EXTERNAL** - These variables are used to configure the name/address of the service in the operating system environment for the application services to interact with each other and with the outside world.
+- Failed to download the docker image.
+- If you want to use a specific version of Docker image.

-  If your server uses only an internal address and is not accessible from the Internet, then the values for these two variables will be the same and the value will be equal to the server's internal name/address.
+Please refer to the 'Build Docker images' section above.

-  If your server uses only an external, Internet-accessible address, then the values for these two variables will be the same and the value will be equal to the server's external name/address.
-
-  If your server is located on an internal network, has an internal address, but is accessible from the Internet via a proxy/firewall/load balancer, then the HOST_IP variable will have a value equal to the internal name/address of the server, and the EXTERNAL_HOST_IP variable will have a value equal to the external name/address of the proxy/firewall/load balancer behind which the server is located.
-
-  We set these values in the file set_env\*\*\*\*.sh
-
- **Variables with names like "**\*\*\*\*\*\*\_PORT"\*\* - These variables set the IP port numbers for establishing network connections to the application services.
-  The values shown in the file set_env.sh or set_env_vllm they are the values used for the development and testing of the application, as well as configured for the environment in which the development is performed. These values must be configured in accordance with the rules of network access to your environment's server, and must not overlap with the IP ports of other applications that are already in use.
-
-#### Set variables with script set_env\*\*\*\*.sh
-
-#### If you use vLLM
-
-```bash
-. set_env_vllm.sh
-```
-
-#### If you use TGI
-
-```bash
-. set_env.sh
-```
-
-### Start the services:
-
-#### If you use vLLM
-
-```bash
-docker compose -f compose_vllm.yaml up -d
-```
-
-#### If you use TGI
-
-```bash
-docker compose -f compose.yaml up -d
-```
-
-All containers should be running and should not restart:
-
-##### If you use vLLM:
-
- audioqna-vllm-service
- whisper-service
- speecht5-service
- audioqna-backend-server
- audioqna-ui-server
-
-##### If you use TGI:
-
- audioqna-tgi-service
- whisper-service
- speecht5-service
- audioqna-backend-server
- audioqna-ui-server
-
---
-
-## Validate the Services
-
-### 1. Validate the vLLM/TGI Service
-
-#### If you use vLLM:
-
-```bash
-DATA='{"model": "Intel/neural-chat-7b-v3-3t", '\
-'"messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'
-
-curl http://${HOST_IP}:${AUDIOQNA_VLLM_SERVICE_PORT}/v1/chat/completions \
-  -X POST \
-  -d "$DATA" \
-  -H 'Content-Type: application/json'
-```
-
-Checking the response from the service. The response should be similar to JSON:
-
-```json
-{
-  "id": "chatcmpl-142f34ef35b64a8db3deedd170fed951",
-  "object": "chat.completion",
-  "created": 1742270316,
-  "model": "Intel/neural-chat-7b-v3-3",
-  "choices": [
-    {
-      "index": 0,
-      "message": {
-        "role": "assistant",
-        "content": "",
-        "tool_calls": []
-      },
-      "logprobs": null,
-      "finish_reason": "length",
-      "stop_reason": null
-    }
-  ],
-  "usage": { "prompt_tokens": 66, "total_tokens": 322, "completion_tokens": 256, "prompt_tokens_details": null },
-  "prompt_logprobs": null
-}
-```
-
-If the service response has a meaningful response in the value of the "choices.message.content" key,
-then we consider the vLLM service to be successfully launched
-
-#### If you use TGI:
-
-```bash
-DATA='{"inputs":"What is Deep Learning?",'\
-'"parameters":{"max_new_tokens":256,"do_sample": true}}'
-
-curl http://${HOST_IP}:${AUDIOQNA_TGI_SERVICE_PORT}/generate \
-  -X POST \
-  -d "$DATA" \
-  -H 'Content-Type: application/json'
-```
-
-Checking the response from the service. The response should be similar to JSON:
-
-```json
-{
-  "generated_text": " "
-}
-```
-
-If the service response has a meaningful response in the value of the "generated_text" key,
-then we consider the TGI service to be successfully launched
-
-### 2. Validate MegaServices
+## 🚀 Consume the AudioQnA Service

 Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the
 base64 string to the megaservice endpoint. The megaservice will return a spoken response as a base64 string. To listen
@@ -327,7 +114,7 @@ curl http://${host_ip}:3008/v1/audioqna \
  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
 ```

-### 3. Validate MicroServices
+## 🚀 Test MicroServices

 ```bash
 # whisper service
@@ -336,25 +123,15 @@ curl http://${host_ip}:7066/v1/asr \
  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
  -H 'Content-Type: application/json'

+# tgi service
+curl http://${host_ip}:3006/generate \
+  -X POST \
+  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+  -H 'Content-Type: application/json'
+
 # speecht5 service
 curl http://${host_ip}:7055/v1/tts \
  -X POST \
  -d '{"text": "Who are you?"}' \
  -H 'Content-Type: application/json'
 ```
-
-### 4. Stop application
-
-#### If you use vLLM
-
-```bash
-cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
-docker compose -f compose_vllm.yaml down
-```
-
-#### If you use TGI
-
-```bash
-cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
-docker compose -f compose.yaml down
-```
--- a/AudioQnA/docker_compose/amd/gpu/rocm/compose_vllm.yaml
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/compose_vllm.yaml
@@ -1,101 +0,0 @@
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
-  whisper-service:
-    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
-    container_name: whisper-service
-    ports:
-      - "${WHISPER_SERVER_PORT:-7066}:7066"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-    restart: unless-stopped
-
-  speecht5-service:
-    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
-    container_name: speecht5-service
-    ports:
-      - "${SPEECHT5_SERVER_PORT:-7055}:7055"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-    restart: unless-stopped
-
-  audioqna-vllm-service:
-    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
-    container_name: audioqna-vllm-service
-    ports:
-      - "${VLLM_SERVICE_PORT:-8081}:8011"
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      WILM_USE_TRITON_FLASH_ATTENTION: 0
-      PYTORCH_JIT: 0
-    volumes:
-      - "${HF_CACHE_DIR:-./data}:/data"
-    shm_size: 20G
-    devices:
-      - /dev/kfd:/dev/kfd
-      - /dev/dri/:/dev/dri/
-    cap_add:
-      - SYS_PTRACE
-    group_add:
-      - video
-    security_opt:
-      - seccomp:unconfined
-      - apparmor=unconfined
-    command: "--model ${LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 1 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
-    ipc: host
-
-  audioqna-backend-server:
-    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
-    container_name: audioqna-backend-server
-    depends_on:
-      - whisper-service
-      - audioqna-vllm-service
-      - speecht5-service
-    ports:
-      - "${BACKEND_SERVICE_PORT:-3008}:8888"
-    environment:
-      no_proxy: ${no_proxy}
-      https_proxy: ${https_proxy}
-      http_proxy: ${http_proxy}
-      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
-      WHISPER_SERVER_HOST_IP: ${WHISPER_SERVER_HOST_IP}
-      WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
-      LLM_SERVER_HOST_IP: ${LLM_SERVER_HOST_IP}
-      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
-      LLM_MODEL_ID: ${LLM_MODEL_ID}
-      SPEECHT5_SERVER_HOST_IP: ${SPEECHT5_SERVER_HOST_IP}
-      SPEECHT5_SERVER_PORT: ${SPEECHT5_SERVER_PORT}
-    ipc: host
-    restart: always
-
-  audioqna-ui-server:
-    image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
-    container_name: audioqna-ui-server
-    depends_on:
-      - audioqna-backend-server
-    ports:
-      - "${FRONTEND_SERVICE_PORT:-5173}:5173"
-    environment:
-      no_proxy: ${no_proxy}
-      https_proxy: ${https_proxy}
-      http_proxy: ${http_proxy}
-      CHAT_URL: ${BACKEND_SERVICE_ENDPOINT}
-    ipc: host
-    restart: always
-
-networks:
-  default:
-    driver: bridge
--- a/AudioQnA/docker_compose/amd/gpu/rocm/set_env_vllm.sh
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/set_env_vllm.sh
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash                                                                                                           set_env.sh
-
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-
-# export host_ip=<your External Public IP>    # export host_ip=$(hostname -I | awk '{print $1}')
-
-export host_ip=""
-export external_host_ip=""
-export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export HF_CACHE_DIR="./data"
-export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export VLLM_SERVICE_PORT="8081"
-
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export WHISPER_SERVER_HOST_IP=${host_ip}
-export SPEECHT5_SERVER_HOST_IP=${host_ip}
-export LLM_SERVER_HOST_IP=${host_ip}
-
-export WHISPER_SERVER_PORT=7066
-export SPEECHT5_SERVER_PORT=7055
-export LLM_SERVER_PORT=${VLLM_SERVICE_PORT}
-export BACKEND_SERVICE_PORT=18038
-export FRONTEND_SERVICE_PORT=18039
-
-export BACKEND_SERVICE_ENDPOINT=http://${external_host_ip}:${BACKEND_SERVICE_PORT}/v1/audioqna
--- a/AudioQnA/docker_compose/intel/cpu/xeon/README_vllm.md
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/README_vllm.md
@@ -1,256 +0,0 @@
-Copyright (C) 2025 Advanced Micro Devices, Inc.
-
-# Deploy AudioQnA application
-
-## 1. Clone repo and build Docker images
-
-### 1.1. Cloning repo
-
-Create an empty directory in home directory and navigate to it:
-
-```bash
-mkdir -p ~/audioqna-test && cd ~/audioqna-test
-```
-
-Clone the GenAIExamples repo to build the Docker images:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples.git
-```
-
-### 1.2. Navigate to the repo directory and switch to the desired version of the code
-
-If you are using the main branch, no checkout is needed; the main branch is used by default.
-
-```bash
-cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_image_build
-git clone https://github.com/opea-project/GenAIComps.git
-```
-
-If you are using a specific branch or tag, check out the desired version with git checkout:
-
-```bash
-### Replace "v1.2" with the code version you need (branch or tag)
-cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_image_build && git checkout v1.2
-git clone https://github.com/opea-project/GenAIComps.git
-```
-
-### 1.3. Build Docker images
-
-#### Build Docker image:
-
-```bash
-service_list="audioqna audioqna-ui whisper speecht5 vllm-rocm"
-docker compose -f build.yaml build --no-cache
-```
-
-### 1.4. Checking for the necessary Docker images
-
-After assembling the images, you can check their presence in the list of available images using the command:
-
-```bash
-docker image ls
-```
-
-The output of the command should contain images:
-
- opea/whisper:latest
- opea/speecht5:latest
- opea/vllm-rocm:latest
- opea/audioqna:latest
- opea/audioqna-ui:latest
-
-## 2. Set deploy environment variables
-
-### Setting variables in the operating system environment
-
-#### Set variables:
-
-```bash
-### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
-export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
-```
-
-### Setting variables in the file set_env_vllm.sh
-
-```bash
-cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
-### The example uses the Nano text editor. You can use any convenient text editor
-nano set_env_vllm.sh
-```
-
-Set the values of the variables:
-
- **host_ip, external_host_ip** - These variables are used to configure the name/address of the service in the operating system environment for the application services to interact with each other and with the outside world.
-
-  If your server uses only an internal address and is not accessible from the Internet, then the values for these two variables will be the same and the value will be equal to the server's internal name/address.
-
-  If your server uses only an external, Internet-accessible address, then the values for these two variables will be the same and the value will be equal to the server's external name/address.
-
-  If your server is located on an internal network, has an internal address, but is accessible from the Internet via a proxy/firewall/load balancer, then the host_ip variable will have a value equal to the internal name/address of the server, and the external_host_ip variable will have a value equal to the external name/address of the proxy/firewall/load balancer behind which the server is located.
-
-  We set these values in the file set_env_vllm.sh
-
- **Variables with names like "%%%%\_PORT"** - These variables set the IP port numbers for establishing network connections to the application services.
-  The values shown in the file set_env_vllm.sh are the ones used for development and testing of the application, and they are configured for the environment in which development is performed. These values must be configured in accordance with the network access rules of your environment's server, and must not overlap with the IP ports of other applications that are already in use.
-
-If you are in a proxy environment, also set the proxy-related environment variables:
-
-```bash
-export http_proxy="Your_HTTP_Proxy"
-export https_proxy="Your_HTTPs_Proxy"
-```
-
-## 3. Deploy application
-
-### 3.1. Deploying applications using Docker Compose
-
-```bash
-cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm/
-docker compose -f compose_vllm.yaml up -d
-```
-
-After starting the containers, you need to view their status with the command:
-
-```bash
-docker ps
-```
-
-The following containers should be running:
-
- whisper-service
- speecht5-service
- audioqna-vllm-service
- audioqna-backend-server
- audioqna-ui-server
-
-Containers should not restart.
-
-#### 3.1.1. Configuring GPU forwarding
-
-By default, the Docker Compose file compose_vllm.yaml is configured to forward all GPUs to the audioqna-vllm-service container.
-To use certain GPUs, you need to configure the forwarding of certain devices from the host system to the container.
-The configuration must be done in:
-
-```yaml
-services:
-  #######
-  audioqna-vllm-service:
-    devices:
-```
-
-Example of isolating 1 GPU:
-
-```
-      - /dev/dri/card0:/dev/dri/card0
-      - /dev/dri/renderD128:/dev/dri/renderD128
-```
-
-Example of isolating 2 GPUs:
-
-```
-      - /dev/dri/card0:/dev/dri/card0
-      - /dev/dri/renderD128:/dev/dri/renderD128
-      - /dev/dri/card1:/dev/dri/card1
-      - /dev/dri/renderD129:/dev/dri/renderD129
-```
-
-### 3.2. Checking the application services
-
-#### 3.2.1. Checking audioqna-vllm-service
-
-Verification is performed in two ways:
-
- Checking the container logs
-
-  ```bash
-  docker logs audioqna-vllm-service
-  ```
-
-  A message like this should appear in the logs:
-
-  ```textmate
-  INFO:     Started server process [1]
-  INFO:     Waiting for application startup.
-  INFO:     Application startup complete.
-  INFO:     Uvicorn running on http://0.0.0.0:8011 (Press CTRL+C to quit)
-  ```
-
- Checking the response from the service
-  ```bash
-  ### curl request
-  ### Replace 18110 with the value set in the startup script in the variable VLLM_SERVICE_PORT
-  curl http://${host_ip}:${VLLM_SERVICE_PORT}/v1/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-      "model": "Intel/neural-chat-7b-v3-3",
-      "prompt": "What is a Deep Learning?",
-      "max_tokens": 30,
-      "temperature": 0
-  }'
-  ```
-  The response from the service must be in the form of JSON:
-  ```json
-  {
-    "id": "cmpl-1d7d175d36d0491cba3abaa8b5bd6991",
-    "object": "text_completion",
-    "created": 1740411135,
-    "model": "Intel/neural-chat-7b-v3-3",
-    "choices": [
-      {
-        "index": 0,
-        "text": " Deep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is called \"deep\" because it",
-        "logprobs": null,
-        "finish_reason": "length",
-        "stop_reason": null,
-        "prompt_logprobs": null
-      }
-    ],
-    "usage": { "prompt_tokens": 7, "total_tokens": 37, "completion_tokens": 30, "prompt_tokens_details": null }
-  }
-  ```
-  The value of "choices[0].text" must contain a response from the service that makes sense.
-  If such a response is present, then the audioqna-vllm-service is considered verified.
-
-#### 3.2.2. Checking whisper-service
-
-Checking the response from the service
-
-```bash
-wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
-curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
-  -H "Content-Type: multipart/form-data" \
-  -F file="@./sample.wav" \
-  -F model="openai/whisper-small"
-```
-
-The response from the service must be in the form of JSON:
-
-```json
-{ "text": "who is pat gelsinger" }
-```
-
-If the value of the text key is "who is pat gelsinger", then we consider the service to be successfully launched.
-
-#### 3.2.3. Checking speecht5-service
-
-Checking the response from the service
-
-```bash
-curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
-```
-
-The result of the request is a speech.mp3 file. If you hear the phrase "Who are you?" while listening to the file, the service is considered successfully launched
-
-#### 3.2.4. Checking audioqna-backend-server
-
-Checking the response from the service
-
-```bash
-curl http://${host_ip}:${BACKEND_SERVICE_PORT}/v1/audioqna \
-  -X POST \
-  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
-  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
-```
-
-The result of the request is the output.wav file. If, when listening to it, you hear the answer that it is an assistant and a request for a new question, then the service is considered started.
--- a/AudioQnA/docker_image_build/build.yaml
+++ b/AudioQnA/docker_image_build/build.yaml
@@ -83,12 +83,3 @@ services:
      dockerfile: Dockerfile.hpu
    extends: audioqna
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
-  vllm-rocm:
-    build:
-      args:
-        http_proxy: ${http_proxy}
-        https_proxy: ${https_proxy}
-        no_proxy: ${no_proxy}
-      context: GenAIComps
-      dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
-    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
--- a/AudioQnA/tests/test_compose_vllm_on_rocm.sh
+++ b/AudioQnA/tests/test_compose_vllm_on_rocm.sh
@@ -1,141 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-export PATH="~/miniconda3/bin:$PATH"
-
-function build_docker_images() {
-    opea_branch=${opea_branch:-"main"}
-    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
-    if [[ "${opea_branch}" != "main" ]]; then
-        cd $WORKPATH
-        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
-        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
-        find . -type f -name "Dockerfile*" | while read -r file; do
-            echo "Processing file: $file"
-            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
-        done
-    fi
-
-    cd $WORKPATH/docker_image_build
-    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
-    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="audioqna audioqna-ui whisper speecht5 vllm-rocm"
-    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-    docker images && sleep 3s
-}
-
-function start_services() {
-    cd $WORKPATH/docker_compose/amd/gpu/rocm/
-
-    export host_ip=${ip_address}
-    export external_host_ip=${ip_address}
-    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export HF_CACHE_DIR="./data"
-    export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-    export VLLM_SERVICE_PORT="8081"
-
-    export MEGA_SERVICE_HOST_IP=${host_ip}
-    export WHISPER_SERVER_HOST_IP=${host_ip}
-    export SPEECHT5_SERVER_HOST_IP=${host_ip}
-    export LLM_SERVER_HOST_IP=${host_ip}
-
-    export WHISPER_SERVER_PORT=7066
-    export SPEECHT5_SERVER_PORT=7055
-    export LLM_SERVER_PORT=${VLLM_SERVICE_PORT}
-    export BACKEND_SERVICE_PORT=3008
-    export FRONTEND_SERVICE_PORT=5173
-
-    export BACKEND_SERVICE_ENDPOINT=http://${external_host_ip}:${BACKEND_SERVICE_PORT}/v1/audioqna
-
-    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
-
-    # Start Docker Containers
-    docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
-    n=0
-    until [[ "$n" -ge 200 ]]; do
-       docker logs audioqna-vllm-service >& $LOG_PATH/vllm_service_start.log
-       if grep -q "Application startup complete" $LOG_PATH/vllm_service_start.log; then
-           break
-       fi
-       sleep 10s
-       n=$((n+1))
-    done
-}
-function validate_megaservice() {
-    response=$(http_proxy="" curl http://${ip_address}:${BACKEND_SERVICE_PORT}/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
-    # always print the log
-    docker logs whisper-service > $LOG_PATH/whisper-service.log
-    docker logs speecht5-service > $LOG_PATH/tts-service.log
-    docker logs audioqna-vllm-service > $LOG_PATH/audioqna-vllm-service.log
-    docker logs audioqna-backend-server > $LOG_PATH/audioqna-backend-server.log
-    echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
-
-    if [[ $(file speech.mp3) == *"RIFF"* ]]; then
-        echo "Result correct."
-    else
-        echo "Result wrong."
-        exit 1
-    fi
-
-}
-
-#function validate_frontend() {
-## Frontend tests are currently disabled
-#    cd $WORKPATH/ui/svelte
-#    local conda_env_name="OPEA_e2e"
-#    export PATH=${HOME}/miniforge3/bin/:$PATH
-##    conda remove -n ${conda_env_name} --all -y
-##    conda create -n ${conda_env_name} python=3.12 -y
-#    source activate ${conda_env_name}
-#
-#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
-#
-##    conda install -c conda-forge nodejs -y
-#    npm install && npm ci && npx playwright install --with-deps
-#    node -v && npm -v && pip list
-#
-#    exit_status=0
-#    npx playwright test || exit_status=$?
-#
-#    if [ $exit_status -ne 0 ]; then
-#        echo "[TEST INFO]: ---------frontend test failed---------"
-#        exit $exit_status
-#    else
-#        echo "[TEST INFO]: ---------frontend test passed---------"
-#    fi
-#}
-
-function stop_docker() {
-    cd $WORKPATH/docker_compose/amd/gpu/rocm/
-    docker compose -f compose_vllm.yaml stop && docker compose -f compose_vllm.yaml rm -f
-}
-
-function main() {
-
-    stop_docker
-    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    start_services
-
-    validate_megaservice
-    # Frontend tests are currently disabled
-    # validate_frontend
-
-    stop_docker
-    echo y | docker system prune
-
-}
-
-main
--- a/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -59,7 +59,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -33,7 +33,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -66,7 +66,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -59,7 +59,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -59,7 +59,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
@@ -108,7 +108,7 @@ services:
    restart: unless-stopped

  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -122,7 +122,7 @@ services:
    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate

  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
@@ -23,7 +23,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_PINECONE"
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -54,7 +54,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_PINECONE"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
@@ -59,7 +59,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -95,7 +95,7 @@ d560c232b120   opea/retriever:latest
 a1d7ca2d3787   ghcr.io/huggingface/tei-gaudi:1.5.0                                                             "text-embeddings-rou…"   2 minutes ago   Up 2 minutes                0.0.0.0:8808->80/tcp, [::]:8808->80/tcp                                                tei-reranking-gaudi-server
 9a9f3fd4fd4c   opea/vllm-gaudi:latest                                                                          "python3 -m vllm.ent…"   2 minutes ago   Exited (1) 2 minutes ago                                                                                           vllm-gaudi-server
 1ab9bbdf5182   redis/redis-stack:7.2.0-v9                                                                      "/entrypoint.sh"         2 minutes ago   Up 2 minutes                0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp   redis-vector-db
-9ee0789d819e   ghcr.io/huggingface/text-embeddings-inference:cpu-1.6                                           "text-embeddings-rou…"   2 minutes ago   Up 2 minutes                0.0.0.0:8090->80/tcp, [::]:8090->80/tcp                                                tei-embedding-gaudi-server
+9ee0789d819e   ghcr.io/huggingface/text-embeddings-inference:cpu-1.5                                           "text-embeddings-rou…"   2 minutes ago   Up 2 minutes                0.0.0.0:8090->80/tcp, [::]:8090->80/tcp                                                tei-embedding-gaudi-server
 ```

 ### Test the Pipeline
@@ -148,7 +148,7 @@ The default deployment utilizes Gaudi devices primarily for the `vllm-service`,
 | ---------------------------- | ----------------------------------------------------- | ------------ |
 | redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No           |
 | dataprep-redis-service       | opea/dataprep:latest                                  | No           |
-| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No           |
+| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No           |
 | retriever                    | opea/retriever:latest                                 | No           |
 | tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card       |
 | vllm-service                 | opea/vllm-gaudi:latest                                | Configurable |
@@ -164,7 +164,7 @@ The TGI (Text Generation Inference) deployment and the default deployment differ
 | ---------------------------- | ----------------------------------------------------- | -------------- |
 | redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No             |
 | dataprep-redis-service       | opea/dataprep:latest                                  | No             |
-| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No             |
+| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No             |
 | retriever                    | opea/retriever:latest                                 | No             |
 | tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card         |
 | **tgi-service**              | ghcr.io/huggingface/tgi-gaudi:2.0.6                   | Configurable   |
@@ -184,7 +184,7 @@ The TGI (Text Generation Inference) deployment and the default deployment differ
 | ---------------------------- | ----------------------------------------------------- | ------------ |
 | redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No           |
 | dataprep-redis-service       | opea/dataprep:latest                                  | No           |
-| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No           |
+| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No           |
 | retriever                    | opea/retriever:latest                                 | No           |
 | tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card       |
 | vllm-service                 | opea/vllm-gaudi:latest                                | Configurable |
@@ -203,7 +203,7 @@ The _compose_without_rerank.yaml_ Docker Compose file is distinct from the defau
 | ---------------------------- | ----------------------------------------------------- | -------------- |
 | redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No             |
 | dataprep-redis-service       | opea/dataprep:latest                                  | No             |
-| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No             |
+| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No             |
 | retriever                    | opea/retriever:latest                                 | No             |
 | vllm-service                 | opea/vllm-gaudi:latest                                | Configurable   |
 | chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No             |
@@ -222,7 +222,7 @@ The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over t
 | dataprep-redis-service       | opea/dataprep:latest                                  | No             | No       |
 | _tgi-guardrails-service_     | ghcr.io/huggingface/tgi-gaudi:2.0.6                   | 1 card         | Yes      |
 | _guardrails_                 | opea/guardrails:latest                                | No             | No       |
-| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No             | No       |
+| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No             | No       |
 | retriever                    | opea/retriever:latest                                 | No             | No       |
 | tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card         | No       |
 | vllm-service                 | opea/vllm-gaudi:latest                                | Configurable   | Yes      |
@@ -258,7 +258,7 @@ The table provides a comprehensive overview of the ChatQnA services utilized acr
 | ---------------------------- | ----------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------- |
 | redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No       | Acts as a Redis database for storing and managing data.                                            |
 | dataprep-redis-service       | opea/dataprep:latest                                  | No       | Prepares data and interacts with the Redis database.                                               |
-| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No       | Provides text embedding services, often using Hugging Face models.                                 |
+| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No       | Provides text embedding services, often using Hugging Face models.                                 |
 | retriever                    | opea/retriever:latest                                 | No       | Retrieves data from the Redis database and interacts with embedding services.                      |
 | tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | Yes      | Reranks text embeddings, typically using Gaudi hardware for enhanced performance.                  |
 | vllm-service                 | opea/vllm-gaudi:latest                                | No       | Handles large language model (LLM) tasks, utilizing Gaudi hardware.                                |
@@ -284,7 +284,7 @@ ChatQnA now supports running the latest DeepSeek models, including [deepseek-ai/

 ### tei-embedding-service & tei-reranking-service

-The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.6` image supporting `tei-embedding-service` and `tei-reranking-service` depends on the `EMBEDDING_MODEL_ID` or `RERANK_MODEL_ID` environment variables respectively to specify the embedding model and reranking model used for converting text into vector representations and rankings. This choice impacts the quality and relevance of the embeddings rerankings for various applications. Unlike the `vllm-service`, the `tei-embedding-service` and `tei-reranking-service` each typically acquires only one Gaudi device and does not use the `NUM_CARDS` parameter; embedding and reranking tasks generally do not require extensive parallel processing and one Gaudi per service is appropriate. The list of [supported embedding and reranking models](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) can be found at the the [huggingface/tei-gaudi](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) website.
+The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.5` image supporting `tei-embedding-service` and `tei-reranking-service` depends on the `EMBEDDING_MODEL_ID` or `RERANK_MODEL_ID` environment variables respectively to specify the embedding model and reranking model used for converting text into vector representations and rankings. This choice impacts the quality and relevance of the embeddings rerankings for various applications. Unlike the `vllm-service`, the `tei-embedding-service` and `tei-reranking-service` each typically acquires only one Gaudi device and does not use the `NUM_CARDS` parameter; embedding and reranking tasks generally do not require extensive parallel processing and one Gaudi per service is appropriate. The list of [supported embedding and reranking models](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) can be found at the the [huggingface/tei-gaudi](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) website.

 ### tgi-gaurdrails-service

--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -33,7 +33,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-gaudi-server
    ports:
      - "8090:80"
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen.yaml
@@ -27,7 +27,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LOGFLAG: ${LOGFLAG}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-gaudi-server
    ports:
      - "8090:80"
@@ -102,7 +102,7 @@ services:
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
-      timeout: 60s
+      timeout: 10s
      retries: 100
    runtime: habana
    cap_add:
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen_tgi.yaml
@@ -27,7 +27,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LOGFLAG: ${LOGFLAG}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-gaudi-server
    ports:
      - "8090:80"
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml
@@ -65,7 +65,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    restart: unless-stopped
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-gaudi-server
    ports:
      - "8090:80"
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-gaudi-server
    ports:
      - "8090:80"
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-gaudi-server
    ports:
      - "8090:80"
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
@@ -51,7 +51,7 @@ f810f3b4d329   opea/embedding:latest                               "python embed
 174bd43fa6b5   ghcr.io/huggingface/tei-gaudi:1.5.0                    "text-embeddings-rou…"   2 minutes ago   Up 2 minutes                    0.0.0.0:8090->80/tcp, :::8090->80/tcp                                                  tei-embedding-gaudi-server
 05c40b636239   ghcr.io/huggingface/tgi-gaudi:2.0.6                     "text-generation-lau…"   2 minutes ago   Exited (1) About a minute ago                                                                                          tgi-gaudi-server
 74084469aa33   redis/redis-stack:7.2.0-v9                              "/entrypoint.sh"         2 minutes ago   Up 2 minutes                    0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp   redis-vector-db
-88399dbc9e43   ghcr.io/huggingface/text-embeddings-inference:cpu-1.6   "text-embeddings-rou…"   2 minutes ago   Up 2 minutes                    0.0.0.0:8808->80/tcp, :::8808->80/tcp                                                  tei-reranking-gaudi-server
+88399dbc9e43   ghcr.io/huggingface/text-embeddings-inference:cpu-1.5   "text-embeddings-rou…"   2 minutes ago   Up 2 minutes                    0.0.0.0:8808->80/tcp, :::8808->80/tcp                                                  tei-reranking-gaudi-server
 ```

 In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.6` Existed.
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -4,20 +4,12 @@
 # SPDX-License-Identifier: Apache-2.0

 # Function to prompt for input and set environment variables
-NON_INTERACTIVE=${NON_INTERACTIVE:-false}
-
 prompt_for_env_var() {
  local var_name="$1"
  local prompt_message="$2"
  local default_value="$3"
  local mandatory="$4"

-  if [[ "$NON_INTERACTIVE" == "true" ]]; then
-    echo "Non-interactive environment detected. Setting $var_name to default: $default_value"
-    export "$var_name"="$default_value"
-    return
-  fi
-
  if [[ "$mandatory" == "true" ]]; then
    while [[ -z "$value" ]]; do
      read -p "$prompt_message [default: \"${default_value}\"]: " value
@@ -42,7 +34,7 @@ popd > /dev/null

 # Prompt the user for each required environment variable
 prompt_for_env_var "EMBEDDING_MODEL_ID" "Enter the EMBEDDING_MODEL_ID" "BAAI/bge-base-en-v1.5" false
-prompt_for_env_var "HUGGINGFACEHUB_API_TOKEN" "Enter the HUGGINGFACEHUB_API_TOKEN" "${HF_TOKEN}" true
+prompt_for_env_var "HUGGINGFACEHUB_API_TOKEN" "Enter the HUGGINGFACEHUB_API_TOKEN" "" true
 prompt_for_env_var "RERANK_MODEL_ID" "Enter the RERANK_MODEL_ID" "BAAI/bge-reranker-base" false
 prompt_for_env_var "LLM_MODEL_ID" "Enter the LLM_MODEL_ID" "meta-llama/Meta-Llama-3-8B-Instruct" false
 prompt_for_env_var "INDEX_NAME" "Enter the INDEX_NAME" "rag-redis" false
@@ -50,39 +42,32 @@ prompt_for_env_var "NUM_CARDS" "Enter the number of Gaudi devices" "1" false
 prompt_for_env_var "host_ip" "Enter the host_ip" "$(curl ifconfig.me)" false

 #Query for enabling http_proxy
-prompt_for_env_var "http_proxy" "Enter the http_proxy." "${http_proxy}" false
+prompt_for_env_var "http_proxy" "Enter the http_proxy." "" false

 #Query for enabling https_proxy
-prompt_for_env_var "https_proxy" "Enter the https_proxy." "${https_proxy}" false
+prompt_for_env_var "https_proxy" "Enter the https_proxy." "" false

 #Query for enabling no_proxy
-prompt_for_env_var "no_proxy" "Enter the no_proxy." "${no_proxy}" false
+prompt_for_env_var "no_proxy" "Enter the no_proxy." "" false

-if [[ "$NON_INTERACTIVE" == "true" ]]; then
-  # Query for enabling logging
-  prompt_for_env_var "LOGFLAG" "Enable logging? (yes/no): " "true" false
+# Query for enabling logging
+read -p "Enable logging? (yes/no): " logging && logging=$(echo "$logging" | tr '[:upper:]' '[:lower:]')
+if [[ "$logging" == "yes" || "$logging" == "y" ]]; then
+  export LOGFLAG=true
 else
-  # Query for enabling logging
-  read -p "Enable logging? (yes/no): " logging && logging=$(echo "$logging" | tr '[:upper:]' '[:lower:]')
-  if [[ "$logging" == "yes" || "$logging" == "y" ]]; then
-    export LOGFLAG=true
-  else
-    export LOGFLAG=false
-  fi
-
-  # Query for enabling OpenTelemetry Tracing Endpoint
-  read -p "Enable OpenTelemetry Tracing Endpoint? (yes/no): " telemetry && telemetry=$(echo "$telemetry" | tr '[:upper:]' '[:lower:]')
-  if [[ "$telemetry" == "yes" || "$telemetry" == "y" ]]; then
-      export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
-      export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
-      export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
-      telemetry_flag=true
-  else
-      telemetry_flag=false
-  fi
+  export LOGFLAG=false
 fi

-
+# Query for enabling OpenTelemetry Tracing Endpoint
+read -p "Enable OpenTelemetry Tracing Endpoint? (yes/no): " telemetry && telemetry=$(echo "$telemetry" | tr '[:upper:]' '[:lower:]')
+if [[ "$telemetry" == "yes" || "$telemetry" == "y" ]]; then
+    export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+    export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
+    telemetry_flag=true
+else
+    telemetry_flag=false
+fi

 # Generate the .env file
 cat <<EOF > .env
--- a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml
+++ b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml
@@ -26,7 +26,7 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "8090:80"
--- a/ChatQnA/kubernetes/gmc/README.md
+++ b/ChatQnA/kubernetes/gmc/README.md
@@ -15,9 +15,9 @@ The ChatQnA application is defined as a Custom Resource (CR) file that the above
 The ChatQnA uses the below prebuilt images if you choose a Xeon deployment

 - redis-vector-db: redis/redis-stack:7.2.0-v9
- tei_embedding_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+- tei_embedding_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
 - retriever: opea/retriever:latest
- tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+- tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
 - tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
 - chaqna-xeon-backend-server: opea/chatqna:latest

--- a/ChatQnA/tests/test_compose_faqgen_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_faqgen_on_gaudi.sh
@@ -13,7 +13,7 @@ export MODEL_CACHE=${model_cache:-"/data/cache"}

 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
-host_ip=$(hostname -I | awk '{print $1}')
+ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
@@ -37,24 +37,19 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever llm-faqgen vllm-gaudi nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
    docker images && sleep 1s
 }

 function start_services() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
-    export NON_INTERACTIVE=true
-    # export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
-    # export RERANK_MODEL_ID="BAAI/bge-reranker-base"
-    # export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
-    # export NUM_CARDS=1
-    # export INDEX_NAME="rag-redis"
-    # export host_ip=${host_ip}
-    # export LOGFLAG=True
-    # export http_proxy=${http_proxy}
-    # export https_proxy=${https_proxy}
-
+    export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+    export RERANK_MODEL_ID="BAAI/bge-reranker-base"
+    export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
+    export INDEX_NAME="rag-redis"
+    export host_ip=${ip_address}
    export LLM_ENDPOINT_PORT=8010
    export LLM_SERVER_PORT=9001
    export CHATQNA_BACKEND_PORT=8888
@@ -66,9 +61,10 @@ function start_services() {
    export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
    export HF_TOKEN=${HF_TOKEN}
    export VLLM_SKIP_WARMUP=true
-
-    export no_proxy="${host_ip},redis-vector-db,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,llm-faqgen,chatqna-gaudi-backend-server,chatqna-gaudi-ui-server,chatqna-gaudi-nginx-server"
-    source set_env.sh
+    export LOGFLAG=True
+    export http_proxy=${http_proxy}
+    export https_proxy=${https_proxy}
+    export no_proxy="${ip_address},redis-vector-db,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,llm-faqgen,chatqna-gaudi-backend-server,chatqna-gaudi-ui-server,chatqna-gaudi-nginx-server"

    # Start Docker Containers
    docker compose -f compose_faqgen.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
@@ -127,8 +123,8 @@ function validate_microservices() {

    # tei for embedding service
    validate_service \
-        "${host_ip}:8090/embed" \
-        "]]" \
+        "${ip_address}:8090/embed" \
+        "[[" \
        "tei-embedding" \
        "tei-embedding-gaudi-server" \
        '{"inputs":"What is Deep Learning?"}'
@@ -138,28 +134,28 @@ function validate_microservices() {
    # test /v1/dataprep upload file
    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
    validate_service \
-        "http://${host_ip}:6007/v1/dataprep/ingest" \
+        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_file" \
        "dataprep-redis-server"

    # test /v1/dataprep upload link
    validate_service \
-        "http://${host_ip}:6007/v1/dataprep/ingest" \
+        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_link" \
        "dataprep-redis-server"

    # test /v1/dataprep/get_file
    validate_service \
-        "http://${host_ip}:6007/v1/dataprep/get" \
+        "http://${ip_address}:6007/v1/dataprep/get" \
        '{"name":' \
        "dataprep_get" \
        "dataprep-redis-server"

    # test /v1/dataprep/delete_file
    validate_service \
-        "http://${host_ip}:6007/v1/dataprep/delete" \
+        "http://${ip_address}:6007/v1/dataprep/delete" \
        '{"status":true}' \
        "dataprep_del" \
        "dataprep-redis-server"
@@ -167,8 +163,8 @@ function validate_microservices() {
    # retrieval microservice
    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
    validate_service \
-        "${host_ip}:7000/v1/retrieval" \
-        "retrieved_docs" \
+        "${ip_address}:7000/v1/retrieval" \
+        " " \
        "retrieval" \
        "retriever-redis-server" \
        "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"
@@ -176,7 +172,7 @@ function validate_microservices() {
    # tei for rerank microservice
    echo "validate tei..."
    validate_service \
-        "${host_ip}:8808/rerank" \
+        "${ip_address}:8808/rerank" \
        '{"index":1,"score":' \
        "tei-rerank" \
        "tei-reranking-gaudi-server" \
@@ -185,7 +181,7 @@ function validate_microservices() {
    # vllm for llm service
    echo "validate vllm..."
    validate_service \
-        "${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
+        "${ip_address}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
        "content" \
        "vllm-llm" \
        "vllm-gaudi-server" \
@@ -194,7 +190,7 @@ function validate_microservices() {
    # faqgen llm microservice
    echo "validate llm-faqgen..."
    validate_service \
-        "${host_ip}:${LLM_SERVER_PORT}/v1/faqgen" \
+        "${ip_address}:${LLM_SERVER_PORT}/v1/faqgen" \
        "text" \
        "llm" \
        "llm-faqgen-server" \
@@ -204,14 +200,14 @@ function validate_microservices() {
 function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
-        "${host_ip}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
+        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-gaudi-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32}'

    validate_service \
-        "${host_ip}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
+        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-gaudi-backend-server" \
@@ -230,7 +226,7 @@ function validate_frontend() {
    fi
    source activate ${conda_env_name}

-    sed -i "s/localhost/$host_ip/g" playwright.config.ts
+    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
--- a/ChatQnA/tests/test_compose_faqgen_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_faqgen_on_xeon.sh
@@ -38,7 +38,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever llm-faqgen vllm nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker images && sleep 1s
 }

--- a/ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh
@@ -35,7 +35,7 @@ function build_docker_images() {
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
    docker images && sleep 1s
 }
--- a/ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh
@@ -39,7 +39,7 @@ function build_docker_images() {
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker images && sleep 1s
 }

--- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh
@@ -38,7 +38,7 @@ function build_docker_images() {
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

    docker images && sleep 1s
--- a/ChatQnA/tests/test_compose_milvus_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_milvus_on_xeon.sh
@@ -42,7 +42,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
 }
--- a/ChatQnA/tests/test_compose_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_on_gaudi.sh
@@ -37,7 +37,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
    docker images && sleep 1s
 }
--- a/ChatQnA/tests/test_compose_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_on_xeon.sh
@@ -41,7 +41,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
 }
--- a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh
@@ -41,7 +41,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
 }
--- a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh
@@ -36,7 +36,7 @@ function build_docker_images() {
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

    docker images && sleep 1s
--- a/ChatQnA/tests/test_compose_tgi_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_tgi_on_xeon.sh
@@ -36,7 +36,7 @@ function build_docker_images() {
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
 }
--- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh
@@ -37,7 +37,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0

    docker images && sleep 1s
--- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
+++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh
@@ -42,7 +42,7 @@ function build_docker_images() {
    service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
 }
--- a/CodeGen/Dockerfile
+++ b/CodeGen/Dockerfile
@@ -1,8 +1,50 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-ARG BASE_TAG=latest
-FROM opea/comps-base:$BASE_TAG
+# Stage 1: base setup used by other stages
+FROM python:3.11-slim AS base
+
+# get security updates
+RUN apt-get update && apt-get upgrade -y && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ENV HOME=/home/user
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p $HOME && \
+    chown -R user $HOME
+
+WORKDIR $HOME
+
+
+# Stage 2: latest GenAIComps sources
+FROM base AS git
+
+RUN apt-get update && apt-get install -y --no-install-recommends git
+RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
+
+
+# Stage 3: common layer shared by services using GenAIComps
+FROM base AS comps-base
+
+# copy just relevant parts
+COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
+COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
+
+WORKDIR $HOME/GenAIComps
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
+WORKDIR $HOME
+
+ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
+
+USER user
+
+
+# Stage 4: unique part
+FROM comps-base
+
+ENV LANG=C.UTF-8

 COPY ./codegen.py $HOME/codegen.py

--- a/CodeGen/assets/img/ui-result-page.png
+++ b/CodeGen/assets/img/ui-result-page.png
--- a/CodeGen/assets/img/ui-starting-page.png
+++ b/CodeGen/assets/img/ui-starting-page.png
--- a/CodeGen/docker_compose/amd/gpu/rocm/README.md
+++ b/CodeGen/docker_compose/amd/gpu/rocm/README.md
@@ -1,117 +1,47 @@
-# Build and Deploy CodeGen Application on AMD GPU (ROCm)
+# Build and deploy CodeGen Application on AMD GPU (ROCm)

-## Build Docker Images
+## Build images

-### 1. Build Docker Image
+### Build the LLM Docker Image

- #### Create application install directory and go to it:
+```bash
+### Cloning repo
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps

-  ```bash
-  mkdir ~/codegen-install && cd codegen-install
-  ```
+### Build Docker image
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
+```

- #### Clone the repository GenAIExamples (the default repository branch "main" is used here):
+### Build the MegaService Docker Image

-  ```bash
-  git clone https://github.com/opea-project/GenAIExamples.git
-  ```
+```bash
+### Cloning repo
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/CodeGen

-  If you need to use a specific branch/tag of the GenAIExamples repository, then (v1.3 replace with its own value):
+### Build Docker image
+docker build -t opea/codegen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```

-  ```bash
-  git clone https://github.com/opea-project/GenAIExamples.git && cd GenAIExamples && git checkout v1.3
-  ```
+### Build the UI Docker Image

-  We remind you that when using a specific version of the code, you need to use the README from this version:
+```bash
+cd GenAIExamples/CodeGen/ui
+### Build UI Docker image
+docker build -t opea/codegen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .

- #### Go to build directory:
+### Build React UI Docker image (React UI allows you to use file uploads)
+docker build --no-cache -t opea/codegen-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```

-  ```bash
-  cd ~/codegen-install/GenAIExamples/CodeGen/docker_image_build
-  ```
+It is recommended to use the React UI as it works for downloading files. The use of React UI is set in the Docker Compose file.

- Cleaning up the GenAIComps repository if it was previously cloned in this directory.
-  This is necessary if the build was performed earlier and the GenAIComps folder exists and is not empty:
+## Deploy CodeGen Application

-  ```bash
-  echo Y | rm -R GenAIComps
-  ```
+### Features of Docker compose for AMD GPUs

- #### Clone the repository GenAIComps (the default repository branch "main" is used here):
-
-  ```bash
-  git clone https://github.com/opea-project/GenAIComps.git
-  ```
-
-  If you use a specific tag of the GenAIExamples repository,
-  then you should also use the corresponding tag for GenAIComps. (v1.3 replace with its own value):
-
-  ```bash
-  git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout v1.3
-  ```
-
-  We remind you that when using a specific version of the code, you need to use the README from this version.
-
- #### Setting the list of images for the build (from the build file.yaml)
-
-  If you want to deploy a vLLM-based or TGI-based application, then the set of services is installed as follows:
-
-  #### vLLM-based application
-
-  ```bash
-  service_list="vllm-rocm llm-textgen codegen codegen-ui"
-  ```
-
-  #### TGI-based application
-
-  ```bash
-  service_list="llm-textgen codegen codegen-ui"
-  ```
-
- #### Optional. Pull TGI Docker Image (Do this if you want to use TGI)
-
-  ```bash
-  docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
-  ```
-
- #### Build Docker Images
-
-  ```bash
-  docker compose -f build.yaml build ${service_list} --no-cache
-  ```
-
-  After the build, we check the list of images with the command:
-
-  ```bash
-  docker image ls
-  ```
-
-  The list of images should include:
-
-  ##### vLLM-based application:
-
-  - opea/vllm-rocm:latest
-  - opea/llm-textgen:latest
-  - opea/codegen:latest
-  - opea/codegen-ui:latest
-
-  ##### TGI-based application:
-
-  - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
-  - opea/llm-textgen:latest
-  - opea/codegen:latest
-  - opea/codegen-ui:latest
-
---
-
-## Deploy the CodeGen Application
-
-### Docker Compose Configuration for AMD GPUs
-
-To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose file:
-
- compose_vllm.yaml - for vLLM-based application
- compose.yaml - for TGI-based
+1. Added forwarding of GPU devices to the TGI service container with the following settings:

 ```yaml
 shm_size: 1g
@@ -126,7 +56,9 @@ security_opt:
  - seccomp:unconfined
 ```

-This configuration forwards all available GPUs to the container. To use a specific GPU, specify its `cardN` and `renderN` device IDs. For example:
+In this case, all GPUs are forwarded to the container. To forward a specific GPU, you need to use its specific device names `cardN` and `renderN`.
+
+For example:

 ```yaml
 shm_size: 1g
@@ -142,284 +74,53 @@ security_opt:
  - seccomp:unconfined
 ```

-**How to Identify GPU Device IDs:**
-Use AMD GPU driver utilities to determine the correct `cardN` and `renderN` IDs for your GPU.
+To find out which `cardN` and `renderN` device IDs correspond to the same GPU, use the AMD GPU driver utilities.

-### Set deploy environment variables
-
-#### Setting variables in the operating system environment:
-
-##### Set variable HUGGINGFACEHUB_API_TOKEN:
+### Go to the directory with the Docker compose file

 ```bash
-### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
-export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
+cd GenAIExamples/CodeGen/docker_compose/amd/gpu/rocm
 ```

-#### Set variables value in set_env\*\*\*\*.sh file:
+### Set environments

-Go to Docker Compose directory:
-
-```bash
-cd ~/codegen-install/GenAIExamples/CodeGen/docker_compose/amd/gpu/rocm
-```
-
-The example uses the Nano text editor. You can use any convenient text editor:
-
-#### If you use vLLM
-
-```bash
-nano set_env_vllm.sh
-```
-
-#### If you use TGI
-
-```bash
-nano set_env.sh
-```
-
-If you are in a proxy environment, also set the proxy-related environment variables:
-
-```bash
-export http_proxy="Your_HTTP_Proxy"
-export https_proxy="Your_HTTPs_Proxy"
-```
-
-Set the values of the variables:
-
- **HOST_IP, HOST_IP_EXTERNAL** - These variables are used to configure the name/address of the service in the operating system environment for the application services to interact with each other and with the outside world.
-
-  If your server uses only an internal address and is not accessible from the Internet, then the values for these two variables will be the same and the value will be equal to the server's internal name/address.
-
-  If your server uses only an external, Internet-accessible address, then the values for these two variables will be the same and the value will be equal to the server's external name/address.
-
-  If your server is located on an internal network, has an internal address, but is accessible from the Internet via a proxy/firewall/load balancer, then the HOST_IP variable will have a value equal to the internal name/address of the server, and the EXTERNAL_HOST_IP variable will have a value equal to the external name/address of the proxy/firewall/load balancer behind which the server is located.
-
-  We set these values in the file set_env\*\*\*\*.sh
-
- **Variables with names like "**\*\*\*\*\*\*\_PORT"\*\* - These variables set the IP port numbers for establishing network connections to the application services.
-  The values shown in the file set_env.sh or set_env_vllm they are the values used for the development and testing of the application, as well as configured for the environment in which the development is performed. These values must be configured in accordance with the rules of network access to your environment's server, and must not overlap with the IP ports of other applications that are already in use.
-
-#### Set variables with script set_env\*\*\*\*.sh
-
-#### If you use vLLM
-
-```bash
-. set_env_vllm.sh
-```
-
-#### If you use TGI
+Set the required values in the file `GenAIExamples/CodeGen/docker_compose/amd/gpu/rocm/set_env.sh`. The purpose of each parameter is described in the comment above the corresponding variable assignment.

 ```bash
+chmod +x set_env.sh
 . set_env.sh
 ```

-### Start the services:
+### Run services

-#### If you use vLLM
-
-```bash
-docker compose -f compose_vllm.yaml up -d
+```bash
+docker compose up -d
 ```

-#### If you use TGI
+# Validate the MicroServices and MegaService
+
+## Validate TGI service

 ```bash
-docker compose -f compose.yaml up -d
-```
-
-All containers should be running and should not restart:
-
-##### If you use vLLM:
-
- codegen-vllm-service
- codegen-llm-server
- codegen-backend-server
- codegen-ui-server
-
-##### If you use TGI:
-
- codegen-tgi-service
- codegen-llm-server
- codegen-backend-server
- codegen-ui-server
-
---
-
-## Validate the Services
-
-### 1. Validate the vLLM/TGI Service
-
-#### If you use vLLM:
-
-```bash
-DATA='{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", '\
-'"messages": [{"role": "user", "content": "Implement a high-level API for a TODO list application. '\
-'The API takes as input an operation request and updates the TODO list in place. '\
-'If the request is invalid, raise an exception."}], "max_tokens": 256}'
-
-curl http://${HOST_IP}:${CODEGEN_VLLM_SERVICE_PORT}/v1/chat/completions \
-  -X POST \
-  -d "$DATA" \
-  -H 'Content-Type: application/json'
-```
-
-Checking the response from the service. The response should be similar to JSON:
-
-````json
-{
-  "id": "chatcmpl-142f34ef35b64a8db3deedd170fed951",
-  "object": "chat.completion",
-  "created": 1742270316,
-  "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
-  "choices": [
-    {
-      "index": 0,
-      "message": {
-        "role": "assistant",
-        "content": "```python\nfrom typing import Optional, List, Dict, Union\nfrom pydantic import BaseModel, validator\n\nclass OperationRequest(BaseModel):\n    # Assuming OperationRequest is already defined as per the given text\n    pass\n\nclass UpdateOperation(OperationRequest):\n    new_items: List[str]\n\n    def apply_and_maybe_raise(self, updatable_item: \"Updatable todo list\") -> None:\n        # Assuming updatable_item is an instance of Updatable todo list\n        self.validate()\n        updatable_item.add_items(self.new_items)\n\nclass Updatable:\n    # Abstract class for items that can be updated\n    pass\n\nclass TodoList(Updatable):\n    # Class that represents a todo list\n    items: List[str]\n\n    def add_items(self, new_items: List[str]) -> None:\n        self.items.extend(new_items)\n\ndef handle_request(operation_request: OperationRequest) -> None:\n    # Function to handle an operation request\n    if isinstance(operation_request, UpdateOperation):\n        operation_request.apply_and_maybe_raise(get_todo_list_for_update())\n    else:\n        raise ValueError(\"Invalid operation request\")\n\ndef get_todo_list_for_update() -> TodoList:\n    # Function to get the todo list for update\n    # Assuming this function returns the",
-        "tool_calls": []
-      },
-      "logprobs": null,
-      "finish_reason": "length",
-      "stop_reason": null
-    }
-  ],
-  "usage": { "prompt_tokens": 66, "total_tokens": 322, "completion_tokens": 256, "prompt_tokens_details": null },
-  "prompt_logprobs": null
-}
-````
-
-If the service response has a meaningful response in the value of the "choices.message.content" key,
-then we consider the vLLM service to be successfully launched
-
-#### If you use TGI:
-
-```bash
-DATA='{"inputs":"Implement a high-level API for a TODO list application. '\
-'The API takes as input an operation request and updates the TODO list in place. '\
-'If the request is invalid, raise an exception.",'\
-'"parameters":{"max_new_tokens":256,"do_sample": true}}'
-
 curl http://${HOST_IP}:${CODEGEN_TGI_SERVICE_PORT}/generate \
  -X POST \
-  -d "$DATA" \
+  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' \
  -H 'Content-Type: application/json'
 ```

-Checking the response from the service. The response should be similar to JSON:
-
-````json
-{
-  "generated_text": " The supported operations are \"add_task\", \"complete_task\", and \"remove_task\". Each operation can be defined with a corresponding function in the API.\n\nAdd your API in the following format:\n\n```\nTODO App API\n\nsupported operations:\n\noperation name           description\n-----------------------  ------------------------------------------------\n<operation_name>         <operation description>\n```\n\nUse type hints for function parameters and return values. Specify a text description of the API's supported operations.\n\nUse the following code snippet as a starting point for your high-level API function:\n\n```\nclass TodoAPI:\n    def __init__(self, tasks: List[str]):\n        self.tasks = tasks  # List of tasks to manage\n\n    def add_task(self, task: str) -> None:\n        self.tasks.append(task)\n\n    def complete_task(self, task: str) -> None:\n        self.tasks = [t for t in self.tasks if t != task]\n\n    def remove_task(self, task: str) -> None:\n        self.tasks = [t for t in self.tasks if t != task]\n\n    def handle_request(self, request: Dict[str, str]) -> None:\n        operation = request.get('operation')\n        if operation == 'add_task':\n            self.add_task(request.get('task'))\n        elif"
-}
-````
-
-If the service response has a meaningful response in the value of the "generated_text" key,
-then we consider the TGI service to be successfully launched
-
-### 2. Validate the LLM Service
+## Validate LLM service

 ```bash
-DATA='{"query":"Implement a high-level API for a TODO list application. '\
-'The API takes as input an operation request and updates the TODO list in place. '\
-'If the request is invalid, raise an exception.",'\
-'"max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,'\
-'"repetition_penalty":1.03,"stream":false}'
-
-curl http://${HOST_IP}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions \
+curl http://${HOST_IP}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions \
  -X POST \
-  -d "$DATA" \
+  -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' \
  -H 'Content-Type: application/json'
 ```

-Checking the response from the service. The response should be similar to JSON:
-
-````json
-{
-  "id": "cmpl-4e89a590b1af46bfb37ce8f12b2996f8",
-  "choices": [
-    {
-      "finish_reason": "length",
-      "index": 0,
-      "logprobs": null,
-      "text": " The API should support the following operations:\n\n1. Add a new task to the TODO list.\n2. Remove a task from the TODO list.\n3. Mark a task as completed.\n4. Retrieve the list of all tasks.\n\nThe API should also support the following features:\n\n1. The ability to filter tasks based on their completion status.\n2. The ability to sort tasks based on their priority.\n3. The ability to search for tasks based on their description.\n\nHere is an example of how the API can be used:\n\n```python\ntodo_list = []\napi = TodoListAPI(todo_list)\n\n# Add tasks\napi.add_task(\"Buy groceries\")\napi.add_task(\"Finish homework\")\n\n# Mark a task as completed\napi.mark_task_completed(\"Buy groceries\")\n\n# Retrieve the list of all tasks\nprint(api.get_all_tasks())\n\n# Filter tasks based on completion status\nprint(api.filter_tasks(completed=True))\n\n# Sort tasks based on priority\napi.sort_tasks(priority=\"high\")\n\n# Search for tasks based on description\nprint(api.search_tasks(description=\"homework\"))\n```\n\nIn this example, the `TodoListAPI` class is used to manage the TODO list. The `add_task` method adds a new task to the list, the `mark_task_completed` method",
-      "stop_reason": null,
-      "prompt_logprobs": null
-    }
-  ],
-  "created": 1742270567,
-  "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
-  "object": "text_completion",
-  "system_fingerprint": null,
-  "usage": {
-    "completion_tokens": 256,
-    "prompt_tokens": 37,
-    "total_tokens": 293,
-    "completion_tokens_details": null,
-    "prompt_tokens_details": null
-  }
-}
-````
-
-If the service response has a meaningful response in the value of the "choices.text" key,
-then we consider the vLLM service to be successfully launched
-
-### 3. Validate the MegaService
+## Validate MegaService

 ```bash
-DATA='{"messages": "Implement a high-level API for a TODO list application. '\
-'The API takes as input an operation request and updates the TODO list in place. '\
-'If the request is invalid, raise an exception."}'
-
-curl http://${HOST_IP}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen \
-  -H "Content-Type: application/json" \
-  -d "$DATA"
-```
-
-Checking the response from the service. The response should be similar to text:
-
-```textmate
-data: {"id":"cmpl-cc5dc73819c640469f7c7c7424fe57e6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" of","stop_reason":null}],"created":1742270725,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-...........
-data: {"id":"cmpl-cc5dc73819c640469f7c7c7424fe57e6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" all","stop_reason":null}],"created":1742270725,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-cc5dc73819c640469f7c7c7424fe57e6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" tasks","stop_reason":null}],"created":1742270725,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-cc5dc73819c640469f7c7c7424fe57e6","choices":[{"finish_reason":"length","index":0,"logprobs":null,"text":",","stop_reason":null}],"created":1742270725,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: [DONE]
-```
-
-If the output lines in the "choices.text" keys contain words (tokens) containing meaning, then the service is considered launched successfully.
-
-### 4. Validate the Frontend (UI)
-
-To access the UI, use the URL - http://${EXTERNAL_HOST_IP}:${CODEGEN_UI_SERVICE_PORT}
-A page should open when you click through to this address:
-
-![UI start page](../../../../assets/img/ui-starting-page.png)
-
-If a page of this type has opened, then we believe that the service is running and responding,
-and we can proceed to functional UI testing.
-
-Let's enter the task for the service in the "Enter prompt here" field.
-For example, "Write a Python code that returns the current time and date" and press Enter.
-After that, a page with the result of the task should open:
-
-![UI result page](../../../../assets/img/ui-result-page.png)
-
-If the result shown on the page is correct, then we consider the verification of the UI service to be successful.
-
-### 5. Stop application
-
-#### If you use vLLM
-
-```bash
-cd ~/codegen-install/GenAIExamples/CodeGen/docker_compose/amd/gpu/rocm
-docker compose -f compose_vllm.yaml down
-```
-
-#### If you use TGI
-
-```bash
-cd ~/codegen-install/GenAIExamples/CodeGen/docker_compose/amd/gpu/rocm
-docker compose -f compose.yaml down
+curl http://${HOST_IP}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen -H "Content-Type: application/json" -d '{
+  "messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."
+  }'
 ```
--- a/CodeGen/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/CodeGen/docker_compose/amd/gpu/rocm/compose.yaml
@@ -1,5 +1,4 @@
 # Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: Apache-2.0

 services:
@@ -16,9 +15,9 @@ services:
      https_proxy: ${https_proxy}
      HUGGING_FACE_HUB_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
      HUGGINGFACEHUB_API_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      host_ip: ${HOST_IP}
+      host_ip: ${HOST_IP:-${host_ip}}
    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://${HOST_IP}:${CODEGEN_TGI_SERVICE_PORT:-8028}/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://${HOST_IP:-${host_ip}}:${CODEGEN_TGI_SERVICE_PORT:-8028}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
@@ -47,11 +46,9 @@ services:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
-      LLM_ENDPOINT: ${CODEGEN_TGI_LLM_ENDPOINT}
+      LLM_ENDPOINT: "http://codegen-tgi-service"
      LLM_MODEL_ID: ${CODEGEN_LLM_MODEL_ID}
      HUGGINGFACEHUB_API_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      HF_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      LLM_COMPONENT_NAME: "OpeaTextGenService"
    restart: unless-stopped
  codegen-backend-server:
    image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
@@ -65,8 +62,7 @@ services:
      https_proxy: ${https_proxy}
      http_proxy: ${http_proxy}
      MEGA_SERVICE_HOST_IP: ${CODEGEN_MEGA_SERVICE_HOST_IP}
-      LLM_SERVICE_HOST_IP: ${HOST_IP}
-      LLM_SERVICE_PORT: ${CODEGEN_LLM_SERVICE_PORT}
+      LLM_SERVICE_HOST_IP: "codegen-llm-server"
    ipc: host
    restart: always
  codegen-ui-server:
--- a/CodeGen/docker_compose/amd/gpu/rocm/compose_vllm.yaml
+++ b/CodeGen/docker_compose/amd/gpu/rocm/compose_vllm.yaml
@@ -1,94 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
-  codegen-vllm-service:
-    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
-    container_name: codegen-vllm-service
-    ports:
-      - "${CODEGEN_VLLM_SERVICE_PORT:-8081}:8011"
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      HF_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      WILM_USE_TRITON_FLASH_ATTENTION: 0
-      PYTORCH_JIT: 0
-    healthcheck:
-      test: [ "CMD-SHELL", "curl -f http://${HOST_IP}:${CODEGEN_VLLM_SERVICE_PORT:-8028}/health || exit 1" ]
-      interval: 10s
-      timeout: 10s
-      retries: 100
-    volumes:
-      - "./data:/data"
-    shm_size: 20G
-    devices:
-      - /dev/kfd:/dev/kfd
-      - /dev/dri/:/dev/dri/
-    cap_add:
-      - SYS_PTRACE
-    group_add:
-      - video
-    security_opt:
-      - seccomp:unconfined
-      - apparmor=unconfined
-    command: "--model ${CODEGEN_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
-    ipc: host
-  codegen-llm-server:
-    image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
-    container_name: codegen-llm-server
-    depends_on:
-      codegen-vllm-service:
-        condition: service_healthy
-    ports:
-      - "${CODEGEN_LLM_SERVICE_PORT:-9000}:9000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      LLM_ENDPOINT: ${CODEGEN_VLLM_ENDPOINT}
-      LLM_MODEL_ID: ${CODEGEN_LLM_MODEL_ID}
-      HUGGINGFACEHUB_API_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      HF_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
-      LLM_COMPONENT_NAME: "OpeaTextGenService"
-    restart: unless-stopped
-  codegen-backend-server:
-    image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
-    container_name: codegen-backend-server
-    depends_on:
-      - codegen-llm-server
-    ports:
-      - "${CODEGEN_BACKEND_SERVICE_PORT:-7778}:7778"
-    environment:
-      no_proxy: ${no_proxy}
-      https_proxy: ${https_proxy}
-      http_proxy: ${http_proxy}
-      MEGA_SERVICE_HOST_IP: ${CODEGEN_MEGA_SERVICE_HOST_IP}
-      LLM_SERVICE_HOST_IP: ${HOST_IP}
-      LLM_SERVICE_PORT: ${CODEGEN_LLM_SERVICE_PORT}
-    ipc: host
-    restart: always
-  codegen-ui-server:
-    image: ${REGISTRY:-opea}/codegen-ui:${TAG:-latest}
-    container_name: codegen-ui-server
-    depends_on:
-      - codegen-backend-server
-    ports:
-      - "${CODEGEN_UI_SERVICE_PORT:-5173}:5173"
-    environment:
-      no_proxy: ${no_proxy}
-      https_proxy: ${https_proxy}
-      http_proxy: ${http_proxy}
-      BASIC_URL: ${CODEGEN_BACKEND_SERVICE_URL}
-      BACKEND_SERVICE_ENDPOINT: ${CODEGEN_BACKEND_SERVICE_URL}
-    ipc: host
-    restart: always
-
-networks:
-  default:
-    driver: bridge
--- a/CodeGen/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/CodeGen/docker_compose/amd/gpu/rocm/set_env.sh
@@ -1,18 +1,16 @@
 #!/usr/bin/env bash

 # Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: Apache-2.0

 ### The IP address or domain name of the server on which the application is running
-export HOST_IP=''
-export EXTERNAL_HOST_IP=''
+export HOST_IP=''

 ### The port of the TGI service. On this port, the TGI service will accept connections
 export CODEGEN_TGI_SERVICE_PORT=8028

 ### A token for accessing repositories with models
-export CODEGEN_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export CODEGEN_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}

 ### Model ID
 export CODEGEN_LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
@@ -30,7 +28,7 @@ export CODEGEN_MEGA_SERVICE_HOST_IP=${HOST_IP}
 export CODEGEN_BACKEND_SERVICE_PORT=18150

 ### The URL of CodeGen backend service, used by the frontend service
-export CODEGEN_BACKEND_SERVICE_URL="http://${EXTERNAL_HOST_IP}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen"
+export CODEGEN_BACKEND_SERVICE_URL="http://${HOST_IP}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen"

 ### The endpoint of the LLM service to which requests to this service will be sent
 export CODEGEN_LLM_SERVICE_HOST_IP=${HOST_IP}
--- a/CodeGen/docker_compose/amd/gpu/rocm/set_env_vllm.sh
+++ b/CodeGen/docker_compose/amd/gpu/rocm/set_env_vllm.sh
@@ -1,37 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-### The IP address or domain name of the server on which the application is running
-export HOST_IP=''
-export EXTERNAL_HOST_IP=''
-
-### The port of the vLLM service. On this port, the TGI service will accept connections
-export CODEGEN_VLLM_SERVICE_PORT=8028
-export CODEGEN_VLLM_ENDPOINT="http://${HOST_IP}:${CODEGEN_VLLM_SERVICE_PORT}"
-
-### A token for accessing repositories with models
-export CODEGEN_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-
-### Model ID
-export CODEGEN_LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-
-### The port of the LLM service. On this port, the LLM service will accept connections
-export CODEGEN_LLM_SERVICE_PORT=9000
-
-### The IP address or domain name of the server for CodeGen MegaService
-export CODEGEN_MEGA_SERVICE_HOST_IP=${HOST_IP}
-
-### The port for CodeGen backend service
-export CODEGEN_BACKEND_SERVICE_PORT=18150
-
-### The URL of CodeGen backend service, used by the frontend service
-export CODEGEN_BACKEND_SERVICE_URL="http://${EXTERNAL_HOST_IP}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen"
-
-### The endpoint of the LLM service to which requests to this service will be sent
-export CODEGEN_LLM_SERVICE_HOST_IP=${HOST_IP}
-
-### The CodeGen service UI port
-export CODEGEN_UI_SERVICE_PORT=18151
--- a/CodeGen/docker_image_build/build.yaml
+++ b/CodeGen/docker_image_build/build.yaml
@@ -29,11 +29,6 @@ services:
      dockerfile: comps/llms/src/text-generation/Dockerfile
    extends: codegen
    image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
-  vllm-rocm:
-    build:
-      context: GenAIComps
-      dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
-    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
  vllm:
    build:
      context: vllm
--- a/CodeGen/tests/test_compose_on_rocm.sh
+++ b/CodeGen/tests/test_compose_on_rocm.sh
@@ -34,7 +34,7 @@ function build_docker_images() {
    service_list="codegen codegen-ui llm-textgen"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+    docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
    docker images && sleep 1s
 }

@@ -51,7 +51,7 @@ function start_services() {
    export CODEGEN_BACKEND_SERVICE_PORT=7778
    export CODEGEN_BACKEND_SERVICE_URL="http://${ip_address}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen"
    export CODEGEN_UI_SERVICE_PORT=5173
-    export HOST_IP=${ip_address}
+    export host_ip=${ip_address}

    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

@@ -100,15 +100,15 @@ function validate_services() {
 function validate_microservices() {
    # tgi for llm service
    validate_services \
-        "${ip_address}:${CODEGEN_TGI_SERVICE_PORT}/generate" \
+        "${ip_address}:8028/generate" \
        "generated_text" \
        "codegen-tgi-service" \
        "codegen-tgi-service" \
        '{"inputs":"def print_hello_world():","parameters":{"max_new_tokens":256, "do_sample": true}}'
-    sleep 10
+
    # llm microservice
    validate_services \
-        "${ip_address}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions" \
+        "${ip_address}:9000/v1/chat/completions" \
        "data: " \
        "codegen-llm-server" \
        "codegen-llm-server" \
--- a/CodeGen/tests/test_compose_vllm_on_rocm.sh
+++ b/CodeGen/tests/test_compose_vllm_on_rocm.sh
@@ -1,181 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-
-function build_docker_images() {
-    opea_branch=${opea_branch:-"main"}
-    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
-    if [[ "${opea_branch}" != "main" ]]; then
-        cd $WORKPATH
-        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
-        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
-        find . -type f -name "Dockerfile*" | while read -r file; do
-            echo "Processing file: $file"
-            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
-        done
-    fi
-
-    cd $WORKPATH/docker_image_build
-    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
-    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="vllm-rocm llm-textgen codegen codegen-ui"
-    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
-    docker images && sleep 1s
-}
-
-function start_services() {
-    cd $WORKPATH/docker_compose/amd/gpu/rocm/
-
-    export CODEGEN_LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-    export CODEGEN_VLLM_SERVICE_PORT=8028
-    export CODEGEN_VLLM_ENDPOINT="http://${ip_address}:${CODEGEN_VLLM_SERVICE_PORT}"
-    export CODEGEN_LLM_SERVICE_PORT=9000
-    export CODEGEN_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export CODEGEN_MEGA_SERVICE_HOST_IP=${ip_address}
-    export CODEGEN_LLM_SERVICE_HOST_IP=${ip_address}
-    export CODEGEN_BACKEND_SERVICE_PORT=7778
-    export CODEGEN_BACKEND_SERVICE_URL="http://${ip_address}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen"
-    export CODEGEN_UI_SERVICE_PORT=5173
-    export HOST_IP=${ip_address}
-
-    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
-
-    # Start Docker Containers
-    docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
-
-    n=0
-    until [[ "$n" -ge 500 ]]; do
-        docker logs codegen-vllm-service >& "${LOG_PATH}"/codegen-vllm-service_start.log
-        if grep -q "Application startup complete" "${LOG_PATH}"/codegen-vllm-service_start.log; then
-            break
-        fi
-        sleep 20s
-        n=$((n+1))
-    done
-}
-
-function validate_services() {
-    local URL="$1"
-    local EXPECTED_RESULT="$2"
-    local SERVICE_NAME="$3"
-    local DOCKER_NAME="$4"
-    local INPUT_DATA="$5"
-
-    local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-    if [ "$HTTP_STATUS" -eq 200 ]; then
-        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
-        local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
-        if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
-            echo "[ $SERVICE_NAME ] Content is as expected."
-        else
-            echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
-            docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
-            exit 1
-        fi
-    else
-        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
-        docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
-        exit 1
-    fi
-    sleep 5s
-}
-
-function validate_microservices() {
-    # vLLM for llm service
-    validate_services \
-        "${ip_address}:${CODEGEN_VLLM_SERVICE_PORT}/v1/chat/completions" \
-        "content" \
-        "codegen-vllm-service" \
-        "codegen-vllm-service" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-    sleep 10
-    # llm microservice
-    validate_services \
-        "${ip_address}:${CODEGEN_LLM_SERVICE_PORT}/v1/chat/completions" \
-        "data: " \
-        "codegen-llm-server" \
-        "codegen-llm-server" \
-        '{"query":"def print_hello_world():"}'
-
-}
-
-function validate_megaservice() {
-    # Curl the Mega Service
-    validate_services \
-        "${ip_address}:${CODEGEN_BACKEND_SERVICE_PORT}/v1/codegen" \
-        "print" \
-        "codegen-backend-server" \
-        "codegen-backend-server" \
-        '{"messages": "def print_hello_world():"}'
-
-}
-
-function validate_frontend() {
-    cd $WORKPATH/ui/svelte
-    local conda_env_name="OPEA_e2e"
-    export PATH=${HOME}/miniconda3/bin/:$PATH
-    if conda info --envs | grep -q "$conda_env_name"; then
-        echo "$conda_env_name exist!"
-    else
-        conda create -n ${conda_env_name} python=3.12 -y
-    fi
-    source activate ${conda_env_name}
-
-    sed -i "s/localhost/$ip_address/g" playwright.config.ts
-
-    conda install -c conda-forge nodejs=22.6.0 -y
-    npm install && npm ci && npx playwright install --with-deps
-    node -v && npm -v && pip list
-
-    exit_status=0
-    npx playwright test || exit_status=$?
-
-    if [ $exit_status -ne 0 ]; then
-        echo "[TEST INFO]: ---------frontend test failed---------"
-        exit $exit_status
-    else
-        echo "[TEST INFO]: ---------frontend test passed---------"
-    fi
-}
-
-
-function stop_docker() {
-    echo "OPENAI_API_KEY - ${OPENAI_API_KEY}"
-    cd $WORKPATH/docker_compose/amd/gpu/rocm/
-    docker compose -f compose_vllm.yaml stop && docker compose -f compose_vllm.yaml rm -f
-}
-
-function main() {
-
-    stop_docker
-    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    start_services
-
-    validate_microservices
-    validate_megaservice
-    validate_frontend
-
-    stop_docker
-    echo y | docker system prune
-    cd $WORKPATH
-
-}
-
-main
--- a/CodeTrans/Dockerfile
+++ b/CodeTrans/Dockerfile
@@ -1,8 +1,48 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-ARG BASE_TAG=latest
-FROM opea/comps-base:$BASE_TAG
+# Stage 1: base setup used by other stages
+FROM python:3.11-slim AS base
+
+# get security updates
+RUN apt-get update && apt-get upgrade -y && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ENV HOME=/home/user
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p $HOME && \
+    chown -R user $HOME
+
+WORKDIR $HOME
+
+
+# Stage 2: latest GenAIComps sources
+FROM base AS git
+
+RUN apt-get update && apt-get install -y --no-install-recommends git
+RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
+
+
+# Stage 3: common layer shared by services using GenAIComps
+FROM base AS comps-base
+
+# copy just relevant parts
+COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
+COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
+
+WORKDIR $HOME/GenAIComps
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
+WORKDIR $HOME
+
+ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
+
+USER user
+
+
+# Stage 4: unique part
+FROM comps-base

 COPY ./code_translation.py $HOME/code_translation.py

--- a/CodeTrans/assets/img/ui-result-page.png
+++ b/CodeTrans/assets/img/ui-result-page.png
--- a/CodeTrans/assets/img/ui-starting-page.png
+++ b/CodeTrans/assets/img/ui-starting-page.png
--- a/CodeTrans/docker_compose/amd/gpu/rocm/README.md
+++ b/CodeTrans/docker_compose/amd/gpu/rocm/README.md
@@ -1,119 +1,42 @@
-# Build and Deploy CodeTrans Application on AMD GPU (ROCm)
+# Build and deploy CodeTrans Application on AMD GPU (ROCm)

-## Build Docker Images
+## Build images

-### 1. Build Docker Image
+### Build the LLM Docker Image

- #### Create application install directory and go to it:
+```bash
+### Cloning repo
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps

-  ```bash
-  mkdir ~/codetrans-install && cd codetrans-install
-  ```
+### Build Docker image
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
+```

- #### Clone the repository GenAIExamples (the default repository branch "main" is used here):
+### Build the MegaService Docker Image

-  ```bash
-  git clone https://github.com/opea-project/GenAIExamples.git
-  ```
+```bash
+### Cloning repo
+git clone https://github.com/opea-project/GenAIExamples
+cd GenAIExamples/CodeTrans

-  If you need to use a specific branch/tag of the GenAIExamples repository, then (v1.3 replace with its own value):
+### Build Docker image
+docker build -t opea/codetrans:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```

-  ```bash
-  git clone https://github.com/opea-project/GenAIExamples.git && cd GenAIExamples && git checkout v1.3
-  ```
+### Build the UI Docker Image

-  We remind you that when using a specific version of the code, you need to use the README from this version:
+```bash
+cd GenAIExamples/CodeTrans/ui
+### Build UI Docker image
+docker build -t opea/codetrans-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
+```

- #### Go to build directory:
+## Deploy CodeTrans Application

-  ```bash
-  cd ~/codetrans-install/GenAIExamples/CodeTrans/docker_image_build
-  ```
+### Features of Docker compose for AMD GPUs

- Cleaning up the GenAIComps repository if it was previously cloned in this directory.
-  This is necessary if the build was performed earlier and the GenAIComps folder exists and is not empty:
-
-  ```bash
-  echo Y | rm -R GenAIComps
-  ```
-
- #### Clone the repository GenAIComps (the default repository branch "main" is used here):
-
-  ```bash
-  git clone https://github.com/opea-project/GenAIComps.git
-  ```
-
-  If you use a specific tag of the GenAIExamples repository,
-  then you should also use the corresponding tag for GenAIComps. (v1.3 replace with its own value):
-
-  ```bash
-  git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout v1.3
-  ```
-
-  We remind you that when using a specific version of the code, you need to use the README from this version.
-
- #### Setting the list of images for the build (from the build file.yaml)
-
-  If you want to deploy a vLLM-based or TGI-based application, then the set of services is installed as follows:
-
-  #### vLLM-based application
-
-  ```bash
-  service_list="vllm-rocm llm-textgen codetrans codetrans-ui nginx"
-  ```
-
-  #### TGI-based application
-
-  ```bash
-  service_list="llm-textgen codetrans codetrans-ui nginx"
-  ```
-
- #### Optional. Pull TGI Docker Image (Do this if you want to use TGI)
-
-  ```bash
-  docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
-  ```
-
- #### Build Docker Images
-
-  ```bash
-  docker compose -f build.yaml build ${service_list} --no-cache
-  ```
-
-  After the build, we check the list of images with the command:
-
-  ```bash
-  docker image ls
-  ```
-
-  The list of images should include:
-
-  ##### vLLM-based application:
-
-  - opea/vllm-rocm:latest
-  - opea/llm-textgen:latest
-  - opea/codetrans:latest
-  - opea/codetrans-ui:latest
-  - opea/nginx:latest
-
-  ##### TGI-based application:
-
-  - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
-  - opea/llm-textgen:latest
-  - opea/codetrans:latest
-  - opea/codetrans-ui:latest
-  - opea/nginx:latest
-
---
-
-## Deploy the CodeTrans Application
-
-### Docker Compose Configuration for AMD GPUs
-
-To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose file:
-
- compose_vllm.yaml - for vLLM-based application
- compose.yaml - for TGI-based
+1. GPU devices are forwarded to the TGI service container with the following settings:

 ```yaml
 shm_size: 1g
@@ -128,7 +51,9 @@ security_opt:
  - seccomp:unconfined
 ```

-This configuration forwards all available GPUs to the container. To use a specific GPU, specify its `cardN` and `renderN` device IDs. For example:
+In this case, all GPUs are forwarded to the container. To forward a specific GPU, you need to use its specific device names `cardN` and `renderN`.
+
+For example:

 ```yaml
 shm_size: 1g
@@ -144,274 +69,53 @@ security_opt:
  - seccomp:unconfined
 ```

-**How to Identify GPU Device IDs:**
-Use AMD GPU driver utilities to determine the correct `cardN` and `renderN` IDs for your GPU.
+To find out which device IDs `cardN` and `renderN` correspond to the same GPU, use the GPU driver utilities.

-### Set deploy environment variables
-
-#### Setting variables in the operating system environment:
-
-##### Set variable HUGGINGFACEHUB_API_TOKEN:
+### Go to the directory with the Docker compose file

 ```bash
-### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
-export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
+cd GenAIExamples/CodeTrans/docker_compose/amd/gpu/rocm
 ```

-#### Set variables value in set_env\*\*\*\*.sh file:
+### Set environments

-Go to Docker Compose directory:
-
-```bash
-cd ~/codetrans-install/GenAIExamples/CodeTrans/docker_compose/amd/gpu/rocm
-```
-
-The example uses the Nano text editor. You can use any convenient text editor:
-
-#### If you use vLLM
-
-```bash
-nano set_env_vllm.sh
-```
-
-#### If you use TGI
-
-```bash
-nano set_env.sh
-```
-
-If you are in a proxy environment, also set the proxy-related environment variables:
-
-```bash
-export http_proxy="Your_HTTP_Proxy"
-export https_proxy="Your_HTTPs_Proxy"
-```
-
-Set the values of the variables:
-
- **HOST_IP, HOST_IP_EXTERNAL** - These variables are used to configure the name/address of the service in the operating system environment for the application services to interact with each other and with the outside world.
-
-  If your server uses only an internal address and is not accessible from the Internet, then the values for these two variables will be the same and the value will be equal to the server's internal name/address.
-
-  If your server uses only an external, Internet-accessible address, then the values for these two variables will be the same and the value will be equal to the server's external name/address.
-
-  If your server is located on an internal network, has an internal address, but is accessible from the Internet via a proxy/firewall/load balancer, then the HOST_IP variable will have a value equal to the internal name/address of the server, and the EXTERNAL_HOST_IP variable will have a value equal to the external name/address of the proxy/firewall/load balancer behind which the server is located.
-
-  We set these values in the file set_env\*\*\*\*.sh
-
- **Variables with names like "**\*\*\*\*\*\*\_PORT"\*\* - These variables set the IP port numbers for establishing network connections to the application services.
-  The values shown in the file set_env.sh or set_env_vllm they are the values used for the development and testing of the application, as well as configured for the environment in which the development is performed. These values must be configured in accordance with the rules of network access to your environment's server, and must not overlap with the IP ports of other applications that are already in use.
-
-#### Set variables with script set_env\*\*\*\*.sh
-
-#### If you use vLLM
-
-```bash
-. set_env_vllm.sh
-```
-
-#### If you use TGI
+In the file "GenAIExamples/CodeTrans/docker_compose/amd/gpu/rocm/set_env.sh", set the required values. The purpose of each parameter is described in the comment above the corresponding variable assignment.

 ```bash
+chmod +x set_env.sh
 . set_env.sh
 ```

-### Start the services:
+### Run services

-#### If you use vLLM
-
-```bash
-docker compose -f compose_vllm.yaml up -d
+```bash
+docker compose up -d
 ```

-#### If you use TGI
+# Validate the MicroServices and MegaService
+
+## Validate TGI service

 ```bash
-docker compose -f compose.yaml up -d
-```
-
-All containers should be running and should not restart:
-
-##### If you use vLLM:
-
- codetrans-vllm-service
- codetrans-llm-server
- codetrans-backend-server
- codetrans-ui-server
- codetrans-nginx-server
-
-##### If you use TGI:
-
- codetrans-tgi-service
- codetrans-llm-server
- codetrans-backend-server
- codetrans-ui-server
- codetrans-nginx-server
-
---
-
-## Validate the Services
-
-### 1. Validate the vLLM/TGI Service
-
-#### If you use vLLM:
-
-```bash
-DATA='{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}' \
-
-curl http://${HOST_IP}:${CODETRANS_VLLM_SERVICE_PORT}/v1/chat/completions \
+curl http://${HOST_IP}:${CODETRANS_TGI_SERVICE_PORT}/generate \
  -X POST \
-  -d "$DATA" \
+  -d '{"inputs":"    ### System: Please translate the following Golang codes into  Python codes.    ### Original codes:    '\'''\'''\''Golang    \npackage main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n    '\'''\'''\''    ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
  -H 'Content-Type: application/json'
 ```

-Checking the response from the service. The response should be similar to JSON:
-
-```json
-{
-  "id": "chatcmpl-9080fdc16f0f4f43a4e1b0de1e29af1f",
-  "object": "chat.completion",
-  "created": 1742286287,
-  "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
-  "choices": [
-    {
-      "index": 0,
-      "message": {
-        "role": "assistant",
-        "content": "Deep Learning is a subset of Machine Learning that encompasses a wide range of algorithms and models",
-        "tool_calls": []
-      },
-      "logprobs": null,
-      "finish_reason": "length",
-      "stop_reason": null
-    }
-  ],
-  "usage": { "prompt_tokens": 34, "total_tokens": 51, "completion_tokens": 17, "prompt_tokens_details": null },
-  "prompt_logprobs": null
-}
-```
-
-If the service response has a meaningful response in the value of the "choices.message.content" key,
-then we consider the vLLM service to be successfully launched
-
-#### If you use TGI:
+## Validate LLM service

 ```bash
-DATA='{"inputs":"Implement a high-level API for a TODO list application. '\
-'The API takes as input an operation request and updates the TODO list in place. '\
-'If the request is invalid, raise an exception.",'\
-'"parameters":{"max_new_tokens":256,"do_sample": true}}'
-
-curl http://${HOST_IP}:${CODEGEN_TGI_SERVICE_PORT}/generate \
-  -X POST \
-  -d "$DATA" \
-  -H 'Content-Type: application/json'
-```
-
-Checking the response from the service. The response should be similar to JSON:
-
-````json
-{
-  "generated_text": " The supported operations are \"add_task\", \"complete_task\", and \"remove_task\". Each operation can be defined with a corresponding function in the API.\n\nAdd your API in the following format:\n\n```\nTODO App API\n\nsupported operations:\n\noperation name           description\n-----------------------  ------------------------------------------------\n<operation_name>         <operation description>\n```\n\nUse type hints for function parameters and return values. Specify a text description of the API's supported operations.\n\nUse the following code snippet as a starting point for your high-level API function:\n\n```\nclass TodoAPI:\n    def __init__(self, tasks: List[str]):\n        self.tasks = tasks  # List of tasks to manage\n\n    def add_task(self, task: str) -> None:\n        self.tasks.append(task)\n\n    def complete_task(self, task: str) -> None:\n        self.tasks = [t for t in self.tasks if t != task]\n\n    def remove_task(self, task: str) -> None:\n        self.tasks = [t for t in self.tasks if t != task]\n\n    def handle_request(self, request: Dict[str, str]) -> None:\n        operation = request.get('operation')\n        if operation == 'add_task':\n            self.add_task(request.get('task'))\n        elif"
-}
-````
-
-If the service response has a meaningful response in the value of the "generated_text" key,
-then we consider the TGI service to be successfully launched
-
-### 2. Validate the LLM Service
-
-```bash
-DATA='{"query":"    ### System: Please translate the following Python codes into  Java codes.    '\
-'### Original codes:    '\'''\'''\''Python    \nprint(\"Hello, World!\");\n    '\'''\'''\''    '\
-'### Translated codes:"}'
-
 curl http://${HOST_IP}:${CODETRANS_LLM_SERVICE_PORT}/v1/chat/completions \
  -X POST \
-  -d "$DATA" \
+  -d '{"query":"    ### System: Please translate the following Golang codes into  Python codes.    ### Original codes:    '\'''\'''\''Golang    \npackage main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n    '\'''\'''\''    ### Translated codes:"}' \
  -H 'Content-Type: application/json'
 ```

-Checking the response from the service. The response should be similar to JSON:
-
-```textmate
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"   ","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" ###","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" Java","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"\n","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":".out","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":".println","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"(\"","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"Hello","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":",","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" World","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"!\");","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-c2acd8c385be4f59bae01d1ec31ca617","choices":[{"finish_reason":"stop","index":0,"logprobs":null,"text":"","stop_reason":null}],"created":1742287740,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: [DONE]
-```
-
-If the service response has a meaningful response in the value of the "choices.text" key,
-then we consider the vLLM service to be successfully launched
-
-### 3. Validate the MegaService
+## Validate MegaService

 ```bash
-DATA='{"language_from": "Python","language_to": "Java","source_code": '\
-'"print(\"Hello, World!\");\n}"}'
-
-curl http://${HOST_IP}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans \
+curl http://${HOST_IP}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans \
  -H "Content-Type: application/json" \
-  -d "$DATA"
-```
-
-Checking the response from the service. The response should be similar to text:
-
-```textmate
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"Java","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-..............
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"\n","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":"        ","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" public","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" class","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" HelloWorld","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":null,"index":0,"logprobs":null,"text":" {\n","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: {"id":"cmpl-b63a51caccd34687b26614eb46c0abc6","choices":[{"finish_reason":"length","index":0,"logprobs":null,"text":"            ","stop_reason":null}],"created":1742287989,"model":"Qwen/Qwen2.5-Coder-7B-Instruct","object":"text_completion","system_fingerprint":null,"usage":null}
-data: [DONE]
-```
-
-If the output lines in the "choices.text" keys contain words (tokens) containing meaning, then the service is considered launched successfully.
-
-### 4. Validate the Frontend (UI)
-
-To access the UI, use the URL - http://${EXTERNAL_HOST_IP}:${CODETRANS_NGINX_PORT}
-A page should open when you click through to this address:
-
-![UI start page](../../../../assets/img/ui-starting-page.png)
-
-If a page of this type has opened, then we believe that the service is running and responding,
-and we can proceed to functional UI testing.
-
-For example, let's choose the translation of code from Python to Java.
-Enter the code 'print("hello world!")' in the Python field.
-After that, a page with the result of the task should open:
-
-![UI result page](../../../../assets/img/ui-result-page.png)
-
-If the result shown on the page is correct, then we consider the verification of the UI service to be successful.
-
-### 5. Stop application
-
-#### If you use vLLM
-
-```bash
-cd ~/codetrans-install/GenAIExamples/CodeTrans/docker_compose/amd/gpu/rocm
-docker compose -f compose_vllm.yaml down
-```
-
-#### If you use TGI
-
-```bash
-cd ~/codetrans-install/GenAIExamples/CodeTrans/docker_compose/amd/gpu/rocm
-docker compose -f compose.yaml down
+  -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n}"}'
 ```
--- a/CodeTrans/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/CodeTrans/docker_compose/amd/gpu/rocm/compose.yaml
@@ -1,5 +1,4 @@
 # Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: Apache-2.0

 services:
@@ -20,7 +19,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${CODEGEN_HUGGINGFACEHUB_API_TOKEN}
      host_ip: ${host_ip}
    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://${HOST_IP}:${CODETRANS_TGI_SERVICE_PORT}/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
--- a/CodeTrans/docker_compose/amd/gpu/rocm/compose_vllm.yaml
+++ b/CodeTrans/docker_compose/amd/gpu/rocm/compose_vllm.yaml
@@ -1,113 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# Copyright (c) 2024 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-services:
-  codetrans-vllm-service:
-    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
-    container_name: codetrans-vllm-service
-    ports:
-      - "${CODETRANS_VLLM_SERVICE_PORT:-8081}:8011"
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${CODETRANS_HUGGINGFACEHUB_API_TOKEN}
-      HF_TOKEN: ${CODETRANS_HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      WILM_USE_TRITON_FLASH_ATTENTION: 0
-      PYTORCH_JIT: 0
-    healthcheck:
-      test: [ "CMD-SHELL", "curl -f http://${HOST_IP}:${CODETRANS_VLLM_SERVICE_PORT:-8028}/health || exit 1" ]
-      interval: 10s
-      timeout: 10s
-      retries: 100
-    volumes:
-      - "./data:/data"
-    shm_size: 20G
-    devices:
-      - /dev/kfd:/dev/kfd
-      - /dev/dri/:/dev/dri/
-    cap_add:
-      - SYS_PTRACE
-    group_add:
-      - video
-    security_opt:
-      - seccomp:unconfined
-      - apparmor=unconfined
-    command: "--model ${CODETRANS_LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 4 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
-    ipc: host
-  codetrans-llm-server:
-    image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
-    container_name: codetrans-llm-server
-    depends_on:
-      codetrans-vllm-service:
-        condition: service_healthy
-    ports:
-      - "${CODETRANS_LLM_SERVICE_PORT:-9000}:9000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      LLM_ENDPOINT: ${CODETRANS_LLM_ENDPOINT}
-      LLM_MODEL_ID: ${CODETRANS_LLM_MODEL_ID}
-      HUGGINGFACEHUB_API_TOKEN: ${CODETRANS_HUGGINGFACEHUB_API_TOKEN}
-      HF_TOKEN: ${CODETRANS_HUGGINGFACEHUB_API_TOKEN}
-      LLM_COMPONENT_NAME: "OpeaTextGenService"
-    restart: unless-stopped
-  codetrans-backend-server:
-    image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
-    container_name: codetrans-backend-server
-    depends_on:
-      - codetrans-llm-server
-    ports:
-      - "${CODETRANS_BACKEND_SERVICE_PORT:-7777}:7777"
-    environment:
-      no_proxy: ${no_proxy}
-      https_proxy: ${https_proxy}
-      http_proxy: ${http_proxy}
-      MEGA_SERVICE_HOST_IP: ${HOST_IP}
-      LLM_SERVICE_HOST_IP: ${HOST_IP}
-      LLM_SERVICE_PORT: ${CODETRANS_LLM_SERVICE_PORT}
-    ipc: host
-    restart: always
-  codetrans-ui-server:
-    image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
-    container_name: codetrans-ui-server
-    depends_on:
-      - codetrans-backend-server
-    ports:
-      - "${CODETRANS_FRONTEND_SERVICE_PORT:-5173}:5173"
-    environment:
-      no_proxy: ${no_proxy}
-      https_proxy: ${https_proxy}
-      http_proxy: ${http_proxy}
-      BASE_URL: ${CODETRANS_BACKEND_SERVICE_URL}
-      BASIC_URL: ${CODETRANS_BACKEND_SERVICE_URL}
-    ipc: host
-    restart: always
-  codetrans-nginx-server:
-    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
-    container_name: codetrans-nginx-server
-    depends_on:
-      - codetrans-backend-server
-      - codetrans-ui-server
-    ports:
-      - "${CODETRANS_NGINX_PORT:-80}:80"
-    environment:
-      - no_proxy=${no_proxy}
-      - https_proxy=${https_proxy}
-      - http_proxy=${http_proxy}
-      - FRONTEND_SERVICE_IP=${CODETRANS_FRONTEND_SERVICE_IP}
-      - FRONTEND_SERVICE_PORT=${CODETRANS_FRONTEND_SERVICE_PORT}
-      - BACKEND_SERVICE_NAME=${CODETRANS_BACKEND_SERVICE_NAME}
-      - BACKEND_SERVICE_IP=${CODETRANS_BACKEND_SERVICE_IP}
-      - BACKEND_SERVICE_PORT=${CODETRANS_BACKEND_SERVICE_PORT}
-    ipc: host
-    restart: always
-
-networks:
-  default:
-    driver: bridge
--- a/CodeTrans/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/CodeTrans/docker_compose/amd/gpu/rocm/set_env.sh
@@ -1,15 +1,10 @@
 #!/usr/bin/env bash

-# Copyright (c) 2024 Advanced Micro Devices, Inc.
+# Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

 ### The IP address or domain name of the server on which the application is running
-# If your server is located behind a firewall or proxy, you will need to specify its external address,
-# which can be used to connect to the server from the Internet. It must be specified in the EXTERNAL_HOST_IP variable.
-# If the server is used only on the internal network or has a direct external address,
-# specify it in HOST_IP and in EXTERNAL_HOST_IP.
-export HOST_IP=''
-export EXTERNAL_HOST_IP=''
+export HOST_IP=direct-supercomputer1.powerml.co

 ### Model ID
 export CODETRANS_LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
@@ -21,7 +16,7 @@ export CODETRANS_TGI_SERVICE_PORT=18156
 export CODETRANS_TGI_LLM_ENDPOINT="http://${HOST_IP}:${CODETRANS_TGI_SERVICE_PORT}"

 ### A token for accessing repositories with models
-export CODETRANS_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export CODETRANS_HUGGINGFACEHUB_API_TOKEN=''

 ### The port of the LLM service. On this port, the LLM service will accept connections
 export CODETRANS_LLM_SERVICE_PORT=18157
@@ -33,7 +28,7 @@ export CODETRANS_MEGA_SERVICE_HOST_IP=${HOST_IP}
 export CODETRANS_LLM_SERVICE_HOST_IP=${HOST_IP}

 ### The ip address of the host on which the container with the frontend service is running
-export CODETRANS_FRONTEND_SERVICE_IP=${HOST_IP}
+export CODETRANS_FRONTEND_SERVICE_IP=192.165.1.21

 ### The port of the frontend service
 export CODETRANS_FRONTEND_SERVICE_PORT=18155
@@ -42,7 +37,7 @@ export CODETRANS_FRONTEND_SERVICE_PORT=18155
 export CODETRANS_BACKEND_SERVICE_NAME=codetrans

 ### The ip address of the host on which the container with the backend service is running
-export CODETRANS_BACKEND_SERVICE_IP=${HOST_IP}
+export CODETRANS_BACKEND_SERVICE_IP=192.165.1.21

 ### The port of the backend service
 export CODETRANS_BACKEND_SERVICE_PORT=18154
@@ -51,4 +46,4 @@ export CODETRANS_BACKEND_SERVICE_PORT=18154
 export CODETRANS_NGINX_PORT=18153

 ### Endpoint of the backend service
-export CODETRANS_BACKEND_SERVICE_URL="http://${EXTERNAL_HOST_IP}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans"
+export CODETRANS_BACKEND_SERVICE_URL="http://${HOST_IP}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans"
--- a/CodeTrans/docker_compose/amd/gpu/rocm/set_env_vllm.sh
+++ b/CodeTrans/docker_compose/amd/gpu/rocm/set_env_vllm.sh
@@ -1,54 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (c) 2025 Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: Apache-2.0
-
-### The IP address or domain name of the server on which the application is running
-# If your server is located behind a firewall or proxy, you will need to specify its external address,
-# which can be used to connect to the server from the Internet. It must be specified in the EXTERNAL_HOST_IP variable.
-# If the server is used only on the internal network or has a direct external address,
-# specify it in HOST_IP and in EXTERNAL_HOST_IP.
-export HOST_IP=''
-export EXTERNAL_HOST_IP=''
-
-### Model ID
-export CODETRANS_LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-
-### The port of the TGI service. On this port, the TGI service will accept connections
-export CODETRANS_VLLM_SERVICE_PORT=18156
-
-### The endpoint of the TGI service to which requests to this service will be sent (formed from previously set variables)
-export CODETRANS_LLM_ENDPOINT="http://${HOST_IP}:${CODETRANS_VLLM_SERVICE_PORT}"
-
-### A token for accessing repositories with models
-export CODETRANS_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-
-### The port of the LLM service. On this port, the LLM service will accept connections
-export CODETRANS_LLM_SERVICE_PORT=18157
-
-### The IP address or domain name of the server for CodeTrans MegaService
-export CODETRANS_MEGA_SERVICE_HOST_IP=${HOST_IP}
-
-### The endpoint of the LLM service to which requests to this service will be sent
-export CODETRANS_LLM_SERVICE_HOST_IP=${HOST_IP}
-
-### The ip address of the host on which the container with the frontend service is running
-export CODETRANS_FRONTEND_SERVICE_IP=${HOST_IP}
-
-### The port of the frontend service
-export CODETRANS_FRONTEND_SERVICE_PORT=18155
-
-### Name of GenAI service for route requests to application
-export CODETRANS_BACKEND_SERVICE_NAME=codetrans
-
-### The ip address of the host on which the container with the backend service is running
-export CODETRANS_BACKEND_SERVICE_IP=${HOST_IP}
-
-### The port of the backend service
-export CODETRANS_BACKEND_SERVICE_PORT=18154
-
-### The port of the Nginx reverse proxy for application
-export CODETRANS_NGINX_PORT=18153
-
-### Endpoint of the backend service
-export CODETRANS_BACKEND_SERVICE_URL="http://${EXTERNAL_HOST_IP}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans"
--- a/CodeTrans/docker_image_build/build.yaml
+++ b/CodeTrans/docker_image_build/build.yaml
@@ -41,8 +41,3 @@ services:
      dockerfile: comps/third_parties/nginx/src/Dockerfile
    extends: codetrans
    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
-  vllm-rocm:
-    build:
-      context: GenAIComps
-      dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
-    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
--- a/CodeTrans/tests/test_compose_on_rocm.sh
+++ b/CodeTrans/tests/test_compose_on_rocm.sh
@@ -57,7 +57,7 @@ function start_services() {
    export CODETRANS_BACKEND_SERVICE_PORT=7777
    export CODETRANS_NGINX_PORT=8088
    export CODETRANS_BACKEND_SERVICE_URL="http://${ip_address}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans"
-    export HOST_IP=${ip_address}
+    export host_ip=${ip_address}

    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

@@ -111,7 +111,7 @@ function validate_microservices() {
        "codetrans-tgi-service" \
        "codetrans-tgi-service" \
        '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
-    sleep 10
+
    # llm microservice
    validate_services \
        "${ip_address}:${CODETRANS_LLM_SERVICE_PORT}/v1/chat/completions" \
--- a/CodeTrans/tests/test_compose_vllm_on_rocm.sh
+++ b/CodeTrans/tests/test_compose_vllm_on_rocm.sh
@@ -1,194 +0,0 @@
-#!/bin/bash
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-
-function build_docker_images() {
-    opea_branch=${opea_branch:-"main"}
-    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
-    if [[ "${opea_branch}" != "main" ]]; then
-        cd $WORKPATH
-        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
-        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
-        find . -type f -name "Dockerfile*" | while read -r file; do
-            echo "Processing file: $file"
-            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
-        done
-    fi
-
-    cd $WORKPATH/docker_image_build
-    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-
-    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="vllm-rocm llm-textgen codetrans codetrans-ui nginx"
-    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
-    docker images && sleep 1s
-}
-
-function start_services() {
-    cd $WORKPATH/docker_compose/amd/gpu/rocm/
-    export http_proxy=${http_proxy}
-    export https_proxy=${http_proxy}
-    export HOST_IP=${ip_address}
-    export CODETRANS_VLLM_SERVICE_PORT=8008
-    export CODETRANS_LLM_SERVICE_PORT=9000
-    export CODETRANS_LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
-    export CODETRANS_LLM_ENDPOINT="http://${ip_address}:${CODETRANS_VLLM_SERVICE_PORT}"
-    export CODETRANS_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export CODETRANS_MEGA_SERVICE_HOST_IP=${ip_address}
-    export CODETRANS_LLM_SERVICE_HOST_IP=${ip_address}
-    export CODETRANS_FRONTEND_SERVICE_IP=${ip_address}
-    export CODETRANS_FRONTEND_SERVICE_PORT=5173
-    export CODETRANS_BACKEND_SERVICE_NAME=codetrans
-    export CODETRANS_BACKEND_SERVICE_IP=${ip_address}
-    export CODETRANS_BACKEND_SERVICE_PORT=7777
-    export CODETRANS_NGINX_PORT=8088
-    export CODETRANS_BACKEND_SERVICE_URL="http://${ip_address}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans"
-    export HOST_IP=${ip_address}
-
-    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
-
-    # Start Docker Containers
-    docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
-
-    n=0
-    until [[ "$n" -ge 500 ]]; do
-        docker logs codetrans-vllm-service >& "${LOG_PATH}"/codetrans-vllm-service_start.log
-        if grep -q "Application startup complete" "${LOG_PATH}"/codetrans-vllm-service_start.log; then
-            break
-        fi
-        sleep 20s
-        n=$((n+1))
-    done
-}
-
-function validate_services() {
-    local URL="$1"
-    local EXPECTED_RESULT="$2"
-    local SERVICE_NAME="$3"
-    local DOCKER_NAME="$4"
-    local INPUT_DATA="$5"
-
-    local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
-    if [ "$HTTP_STATUS" -eq 200 ]; then
-        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
-
-        local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
-
-        if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
-            echo "[ $SERVICE_NAME ] Content is as expected."
-        else
-            echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
-            docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
-            exit 1
-        fi
-    else
-        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
-        docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
-        exit 1
-    fi
-    sleep 5s
-}
-
-function validate_microservices() {
-    # tgi for embedding service
-    # vLLM for llm service
-    validate_services \
-        "${ip_address}:${CODETRANS_VLLM_SERVICE_PORT}/v1/chat/completions" \
-        "content" \
-        "codetrans-vllm-service" \
-        "codetrans-vllm-service" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
-    sleep 10
-    # llm microservice
-    validate_services \
-        "${ip_address}:${CODETRANS_LLM_SERVICE_PORT}/v1/chat/completions" \
-        "data: " \
-        "codetrans-llm-server" \
-        "codetrans-llm-server" \
-        '{"query":"    ### System: Please translate the following Golang codes into  Python codes.    ### Original codes:    '\'''\'''\''Golang    \npackage main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n    '\'''\'''\''    ### Translated codes:"}'
-
-}
-
-function validate_megaservice() {
-    # Curl the Mega Service
-    validate_services \
-        "${ip_address}:${CODETRANS_BACKEND_SERVICE_PORT}/v1/codetrans" \
-        "print" \
-        "codetrans-backend-server" \
-        "codetrans-backend-server" \
-        '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n}\n"}'
-
-    # test the megeservice via nginx
-    validate_services \
-        "${ip_address}:${CODETRANS_NGINX_PORT}/v1/codetrans" \
-        "print" \
-        "codetrans-nginx-server" \
-        "codetrans-nginx-server" \
-        '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n    fmt.Println(\"Hello, World!\");\n}\n"}'
-
-}
-
-function validate_frontend() {
-    cd $WORKPATH/ui/svelte
-    local conda_env_name="OPEA_e2e"
-    export PATH=${HOME}/miniconda3/bin/:$PATH
-    if conda info --envs | grep -q "$conda_env_name"; then
-        echo "$conda_env_name exist!"
-    else
-        conda create -n ${conda_env_name} python=3.12 -y
-    fi
-    source activate ${conda_env_name}
-
-    sed -i "s/localhost/$ip_address/g" playwright.config.ts
-
-    conda install -c conda-forge nodejs=22.6.0 -y
-    npm install && npm ci && npx playwright install --with-deps
-    node -v && npm -v && pip list
-
-    exit_status=0
-    npx playwright test || exit_status=$?
-
-    if [ $exit_status -ne 0 ]; then
-        echo "[TEST INFO]: ---------frontend test failed---------"
-        exit $exit_status
-    else
-        echo "[TEST INFO]: ---------frontend test passed---------"
-    fi
-}
-
-function stop_docker() {
-    cd $WORKPATH/docker_compose/amd/gpu/rocm/
-    docker compose -f compose_vllm.yaml stop && docker compose -f compose_vllm.yaml rm -f
-}
-
-function main() {
-
-    stop_docker
-
-    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    start_services
-
-    validate_microservices
-    validate_megaservice
-    validate_frontend
-
-    stop_docker
-    echo y | docker system prune
-
-}
-
-main
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/README.md
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/README.md
@@ -48,14 +48,17 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
 export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
 export REDIS_URL="redis://${host_ip}:6379"
 export INDEX_NAME="rag-redis"
+export MEGA_SERVICE_HOST_IP=${host_ip}
 export EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export RERANK_SERVICE_HOST_IP=${host_ip}
+export LLM_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8000/v1/retrievaltool"
 export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
-cd GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon
+cd GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/
 docker compose up -d
 ```

@@ -66,18 +69,10 @@ In that case, start Docker Containers with compose_without_rerank.yaml
 export host_ip="YOUR IP ADDR"
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
 export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
-cd GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon
+cd GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/
 docker compose -f compose_without_rerank.yaml up -d
 ```

-To run the DocRetriever with Rerank pipeline using the Milvus vector database, use the compose_milvus.yaml configuration file and set the MILVUS_HOST environment variable.
-
-```bash
-export MILVUS_HOST=${host_ip}
-cd GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon
-docker compose -f compose_milvus.yaml up -d
-```
-
 ## 4. Validation

 Add Knowledge Base via HTTP Links:
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
@@ -32,7 +32,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LOGFLAG: ${LOGFLAG}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
    container_name: tei-embedding-server
    ports:
@@ -90,7 +90,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${RERANK_MODEL_ID} --auto-truncate"
    container_name: tei-reranking-server
    ports:
@@ -148,9 +148,11 @@ services:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
+      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP}
      - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP}
      - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP}
+      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
    ipc: host
    restart: always

--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_milvus.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_milvus.yaml
@@ -1,223 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-version: "3.8"
-
-services:
-  milvus-etcd:
-    container_name: milvus-etcd
-    image: quay.io/coreos/etcd:v3.5.5
-    environment:
-      - ETCD_AUTO_COMPACTION_MODE=revision
-      - ETCD_AUTO_COMPACTION_RETENTION=1000
-      - ETCD_QUOTA_BACKEND_BYTES=4294967296
-      - ETCD_SNAPSHOT_COUNT=50000
-    volumes:
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
-    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
-    healthcheck:
-      test: ["CMD", "etcdctl", "endpoint", "health"]
-      interval: 30s
-      timeout: 20s
-      retries: 3
-  milvus-minio:
-    container_name: milvus-minio
-    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
-    environment:
-      MINIO_ACCESS_KEY: minioadmin
-      MINIO_SECRET_KEY: minioadmin
-    ports:
-      - "${MINIO_PORT1:-5044}:9001"
-      - "${MINIO_PORT2:-5043}:9000"
-    volumes:
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
-    command: minio server /minio_data --console-address ":9001"
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
-      interval: 30s
-      timeout: 20s
-      retries: 3
-
-  milvus-standalone:
-    container_name: milvus-standalone
-    image: milvusdb/milvus:v2.4.6
-    command: ["milvus", "run", "standalone"]
-    security_opt:
-      - seccomp:unconfined
-    environment:
-      ETCD_ENDPOINTS: milvus-etcd:2379
-      MINIO_ADDRESS: milvus-minio:9000
-    volumes:
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/config/milvus.yaml:/milvus/configs/milvus.yaml
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
-      interval: 30s
-      start_period: 90s
-      timeout: 20s
-      retries: 3
-    ports:
-      - "19530:19530"
-      - "${MILVUS_STANDALONE_PORT:-9091}:9091"
-    depends_on:
-      - "milvus-etcd"
-      - "milvus-minio"
-
-  dataprep-milvus:
-    image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
-    container_name: dataprep-milvus-server
-    ports:
-      - "${DATAPREP_PORT:-6007}:5000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MILVUS"
-      MILVUS_HOST: ${MILVUS_HOST}
-      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      LOGFLAG: ${LOGFLAG}
-    restart: unless-stopped
-    depends_on:
-      tei-embedding-service:
-        condition: service_healthy
-      milvus-standalone:
-        condition: service_healthy
-      milvus-etcd:
-        condition: service_healthy
-      milvus-minio:
-        condition: service_healthy
-
-  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
-    container_name: tei-embedding-server
-    ports:
-      - "6006:80"
-    volumes:
-      - "./data:/data"
-    shm_size: 1g
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      host_ip: ${host_ip}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://$host_ip:6006/health || exit 1"]
-      interval: 10s
-      timeout: 10s
-      retries: 60
-
-  embedding:
-    image: ${REGISTRY:-opea}/embedding:${TAG:-latest}
-    container_name: embedding-server
-    # volumes:
-    #   - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/comps
-    ports:
-      - "6000:6000"
-    ipc: host
-    depends_on:
-      tei-embedding-service:
-        condition: service_healthy
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
-      LOGFLAG: ${LOGFLAG}
-    restart: unless-stopped
-
-  retriever:
-    image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
-    container_name: retriever-milvus-server
-    depends_on:
-      - milvus-standalone
-    ports:
-      - "7000:7000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      MILVUS_HOST: ${host_ip}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
-      LOGFLAG: ${LOGFLAG}
-      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_MILVUS"
-    restart: unless-stopped
-
-  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${RERANK_MODEL_ID} --auto-truncate"
-    container_name: tei-reranking-server
-    ports:
-      - "8808:80"
-    volumes:
-      - "./data:/data"
-    shm_size: 1g
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      host_ip: ${host_ip}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://$host_ip:8808/health || exit 1"]
-      interval: 10s
-      timeout: 10s
-      retries: 60
-
-  reranking:
-    image: ${REGISTRY:-opea}/reranking:${TAG:-latest}
-    container_name: reranking-tei-xeon-server
-    # volumes:
-    #   - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/user/comps
-    depends_on:
-      tei-reranking-service:
-        condition: service_healthy
-    ports:
-      - "8000:8000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      RERANK_TYPE: ${RERANK_TYPE}
-      TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      LOGFLAG: ${LOGFLAG}
-    restart: unless-stopped
-
-  doc-index-retriever-server:
-    image: ${REGISTRY:-opea}/doc-index-retriever:${TAG:-latest}
-    container_name: doc-index-retriever-server
-    depends_on:
-      - milvus-standalone
-      - tei-embedding-service
-      - embedding
-      - retriever
-      - reranking
-    ports:
-      - "8889:8889"
-    environment:
-      - no_proxy=${no_proxy}
-      - https_proxy=${https_proxy}
-      - http_proxy=${http_proxy}
-      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
-      - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP}
-      - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP}
-      - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP}
-      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
-    ipc: host
-    restart: always
-
-networks:
-  default:
-    driver: bridge
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -30,7 +30,7 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LOGFLAG: ${LOGFLAG}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
    container_name: tei-embedding-server
    ports:
@@ -99,6 +99,7 @@ services:
      no_proxy: ${no_proxy}
      https_proxy: ${https_proxy}
      http_proxy: ${http_proxy}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-0.0.0.0}
      EMBEDDING_SERVICE_HOST_IP: embedding
      EMBEDDING_SERVICE_PORT: ${EMBEDDING_SERVER_PORT:-6000}
      RETRIEVER_SERVICE_HOST_IP: retriever
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/config/milvus.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/config/milvus.yaml
@@ -1,811 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# Licensed to the LF AI & Data foundation under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Related configuration of etcd, used to store Milvus metadata & service discovery.
-etcd:
-  endpoints: localhost:2379
-  rootPath: by-dev # The root path where data is stored in etcd
-  metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
-  kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
-  log:
-    level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
-    # path is one of:
-    #  - "default" as os.Stderr,
-    #  - "stderr" as os.Stderr,
-    #  - "stdout" as os.Stdout,
-    #  - file path to append server logs to.
-    # please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log
-    path: stdout
-  ssl:
-    enabled: false # Whether to support ETCD secure connection mode
-    tlsCert: /path/to/etcd-client.pem # path to your cert file
-    tlsKey: /path/to/etcd-client-key.pem # path to your key file
-    tlsCACert: /path/to/ca.pem # path to your CACert file
-    # TLS min version
-    # Optional values: 1.0, 1.1, 1.2, 1.3。
-    # We recommend using version 1.2 and above.
-    tlsMinVersion: 1.3
-  requestTimeout: 10000 # Etcd operation timeout in milliseconds
-  use:
-    embed: false # Whether to enable embedded Etcd (an in-process EtcdServer).
-  data:
-    dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/
-  auth:
-    enabled: false # Whether to enable authentication
-    userName: # username for etcd authentication
-    password: # password for etcd authentication
-
-metastore:
-  type: etcd # Default value: etcd, Valid values: [etcd, tikv]
-
-# Related configuration of tikv, used to store Milvus metadata.
-# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery.
-# TiKV is a good option when the metadata size requires better horizontal scalability.
-tikv:
-  endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd.
-  rootPath: by-dev # The root path where data is stored in tikv
-  metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
-  kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
-  requestTimeout: 10000 # ms, tikv request timeout
-  snapshotScanSize: 256 # batch size of tikv snapshot scan
-  ssl:
-    enabled: false # Whether to support TiKV secure connection mode
-    tlsCert: # path to your cert file
-    tlsKey: # path to your key file
-    tlsCACert: # path to your CACert file
-
-localStorage:
-  path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/
-
-# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus.
-# We refer to the storage service as MinIO/S3 in the following description for simplicity.
-minio:
-  address: localhost # Address of MinIO/S3
-  port: 9000 # Port of MinIO/S3
-  accessKeyID: minioadmin # accessKeyID of MinIO/S3
-  secretAccessKey: minioadmin # MinIO/S3 encryption string
-  useSSL: false # Access to MinIO/S3 with SSL
-  ssl:
-    tlsCACert: /path/to/public.crt # path to your CACert file
-  bucketName: a-bucket # Bucket name in MinIO/S3
-  rootPath: files # The root path where the message is stored in MinIO/S3
-  # Whether to useIAM role to access S3/GCS instead of access/secret keys
-  # For more information, refer to
-  # aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html
-  # gcp: https://cloud.google.com/storage/docs/access-control/iam
-  # aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control
-  # aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role
-  useIAM: false
-  # Cloud Provider of S3. Supports: "aws", "gcp", "aliyun".
-  # You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio
-  # You can use "gcp" for other cloud provider supports S3 API with signature v2
-  # You can use "aliyun" for other cloud provider uses virtual host style bucket
-  # When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now
-  cloudProvider: aws
-  # Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws".
-  # Leave it empty if you want to use AWS default endpoint
-  iamEndpoint:
-  logLevel: fatal # Log level for aws sdk log. Supported level:  off, fatal, error, warn, info, debug, trace
-  region: # Specify minio storage system location region
-  useVirtualHost: false # Whether use virtual host mode for bucket
-  requestTimeoutMs: 10000 # minio timeout for request time in milliseconds
-  # The maximum number of objects requested per batch in minio ListObjects rpc,
-  # 0 means using oss client by default, decrease these configuration if ListObjects timeout
-  listObjectsMaxKeys: 0
-
-# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka.
-# You can change your mq by setting mq.type field.
-# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file.
-# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka
-# 2. cluster mode:  Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode)
-mq:
-  # Default value: "default"
-  # Valid values: [default, pulsar, kafka, rocksmq, natsmq]
-  type: default
-  enablePursuitMode: true # Default value: "true"
-  pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds
-  pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes
-  mqBufSize: 16 # MQ client consumer buffer length
-  dispatcher:
-    mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge
-    targetBufSize: 16 # the length of channel buffer for targe
-    maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack
-
-# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services.
-pulsar:
-  address: localhost # Address of pulsar
-  port: 6650 # Port of Pulsar
-  webport: 80 # Web port of pulsar, if you connect directly without proxy, should use 8080
-  maxMessageSize: 5242880 # 5 * 1024 * 1024 Bytes, Maximum size of each message in pulsar.
-  tenant: public
-  namespace: default
-  requestTimeout: 60 # pulsar client global request timeout in seconds
-  enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path.
-
-# If you want to enable kafka, needs to comment the pulsar configs
-# kafka:
-#   brokerList:
-#   saslUsername:
-#   saslPassword:
-#   saslMechanisms:
-#   securityProtocol:
-#   ssl:
-#     enabled: false # whether to enable ssl mode
-#     tlsCert:  # path to client's public key (PEM) used for authentication
-#     tlsKey:  # path to client's private key (PEM) used for authentication
-#     tlsCaCert:  # file or directory path to CA certificate(s) for verifying the broker's key
-#     tlsKeyPassword:  # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any
-#   readTimeout: 10
-
-rocksmq:
-  # The path where the message is stored in rocksmq
-  # please adjust in embedded Milvus: /tmp/milvus/rdb_data
-  path: /var/lib/milvus/rdb_data
-  lrucacheratio: 0.06 # rocksdb cache memory ratio
-  rocksmqPageSize: 67108864 # 64 MB, 64 * 1024 * 1024 bytes, The size of each page of messages in rocksmq
-  retentionTimeInMinutes: 4320 # 3 days, 3 * 24 * 60 minutes, The retention time of the message in rocksmq.
-  retentionSizeInMB: 8192 # 8 GB, 8 * 1024 MB, The retention size of the message in rocksmq.
-  compactionInterval: 86400 # 1 day, trigger rocksdb compaction every day to remove deleted data
-  compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level.
-
-# natsmq configuration.
-# more detail: https://docs.nats.io/running-a-nats-service/configuration
-natsmq:
-  server:
-    port: 4222 # Port for nats server listening
-    storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats
-    maxFileStore: 17179869184 # Maximum size of the 'file' storage
-    maxPayload: 8388608 # Maximum number of bytes in a message payload
-    maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections
-    initializeTimeout: 4000 # waiting for initialization of natsmq finished
-    monitor:
-      trace: false # If true enable protocol trace log messages
-      debug: false # If true enable debug log messages
-      logTime: true # If set to false, log without timestamps.
-      logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. of milvus binary if use relative path
-      logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one
-    retention:
-      maxAge: 4320 # Maximum age of any message in the P-channel
-      maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
-      maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
-
-# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests
-rootCoord:
-  dmlChannelNum: 16 # The number of dml channels created at system startup
-  maxPartitionNum: 1024 # Maximum number of partitions in a collection
-  minSegmentSizeToEnableIndex: 1024 # It's a threshold. When the segment size is less than this value, the segment will not be indexed
-  enableActiveStandby: false
-  maxDatabaseNum: 64 # Maximum number of database
-  maxGeneralCapacity: 65536 # upper limit for the sum of of product of partitionNumber and shardNumber
-  gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  ip: # if not specified, use the first unicastable address
-  port: 53100
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-# Related configuration of proxy, used to validate client requests and reduce the returned results.
-proxy:
-  timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
-  healthCheckTimeout: 3000 # ms, the interval that to do component healthy check
-  msgStream:
-    timeTick:
-      bufSize: 512
-  maxNameLength: 255 # Maximum length of name for a collection or alias
-  # Maximum number of fields in a collection.
-  # As of today (2.2.0 and after) it is strongly DISCOURAGED to set maxFieldNum >= 64.
-  # So adjust at your risk!
-  maxFieldNum: 64
-  maxVectorFieldNum: 4 # Maximum number of vector fields in a collection.
-  maxShardNum: 16 # Maximum number of shards in a collection
-  maxDimension: 32768 # Maximum dimension of a vector
-  # Whether to produce gin logs.
-  # please adjust in embedded Milvus: false
-  ginLogging: true
-  ginLogSkipPaths: / # skip url path for gin log
-  maxTaskNum: 1024 # max task number of proxy task queue
-  mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection
-  accessLog:
-    enable: false # if use access log
-    minioEnable: false # if upload sealed access log file to minio
-    localPath: /tmp/milvus_access
-    filename: # Log filename, leave empty to use stdout.
-    maxSize: 64 # Max size for a single file, in MB.
-    cacheSize: 10240 # Size of log of memory cache, in B
-    rotatedTime: 0 # Max time for single access log file in seconds
-    remotePath: access_log/ # File path in minIO
-    remoteMaxTime: 0 # Max time for log file in minIO, in hours
-    formatters:
-      base:
-        format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]"
-      query:
-        format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]"
-        methods: "Query,Search,Delete"
-  connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info
-  connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds
-  maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos
-  gracefulStopTimeout: 30 # seconds. force stop node without graceful stop
-  slowQuerySpanInSeconds: 5 # query whose executed time exceeds the `slowQuerySpanInSeconds` can be considered slow, in seconds.
-  http:
-    enabled: true # Whether to enable the http server
-    debug_mode: false # Whether to enable http server debug mode
-    port: # high-level restful api
-    acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64
-    enablePprof: true # Whether to enable pprof middleware on the metrics port
-  ip: # if not specified, use the first unicastable address
-  port: 19530
-  internalPort: 19529
-  grpc:
-    serverMaxSendSize: 268435456
-    serverMaxRecvSize: 67108864
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 67108864
-
-# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments.
-queryCoord:
-  taskMergeCap: 1
-  taskExecutionCap: 256
-  autoHandoff: true # Enable auto handoff
-  autoBalance: true # Enable auto balance
-  autoBalanceChannel: true # Enable auto balance channel
-  balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes
-  globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes
-  scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance
-  reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance
-  overloadedMemoryThresholdPercentage: 90 # The threshold percentage that memory overload
-  balanceIntervalSeconds: 60
-  memoryUsageMaxDifferencePercentage: 30
-  rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes
-  segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes
-  globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes
-  segmentCountMaxSteps: 50 # segment count based plan generator max steps
-  rowCountMaxSteps: 50 # row count based plan generator max steps
-  randomMaxSteps: 10 # random plan generator max steps
-  growingRowCountWeight: 4 # the memory weight of growing segment row count
-  balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed
-  checkSegmentInterval: 1000
-  checkChannelInterval: 1000
-  checkBalanceInterval: 10000
-  checkIndexInterval: 10000
-  channelTaskTimeout: 60000 # 1 minute
-  segmentTaskTimeout: 120000 # 2 minutes
-  distPullInterval: 500
-  collectionObserverInterval: 200
-  checkExecutedFlagInterval: 100
-  heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available
-  loadTimeoutSeconds: 600
-  distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds
-  heatbeatWarningLag: 5000 # the lag value for querycoord report warning when last heartbeat is too old, in milliseconds
-  checkHandoffInterval: 5000
-  enableActiveStandby: false
-  checkInterval: 1000
-  checkHealthInterval: 3000 # 3s, the interval when query coord try to check health of query node
-  checkHealthRPCTimeout: 2000 # 2s, the timeout of check health rpc to query node
-  brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout
-  collectionRecoverTimes: 3 # if collection recover times reach the limit during loading state, release it
-  observerTaskParallel: 16 # the parallel observer dispatcher task number
-  checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
-  checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session
-  gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  enableStoppingBalance: true # whether enable stopping balance
-  channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode
-  cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds
-  ip: # if not specified, use the first unicastable address
-  port: 19531
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-# Related configuration of queryNode, used to run hybrid search between vector and scalar data.
-queryNode:
-  stats:
-    publishInterval: 1000 # Interval for querynode to report node information (milliseconds)
-  segcore:
-    knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]).
-    chunkRows: 128 # The number of vectors in a chunk.
-    interimIndex:
-      enableIndex: true # Enable segment build with index to accelerate vector search when segment is in growing or binlog.
-      nlist: 128 # temp index nlist, recommend to set sqrt(chunkRows), must be smaller than chunkRows/8
-      nprobe: 16 # nprobe to search small index, based on your accuracy requirement, must be smaller than nlist
-      memExpansionRate: 1.15 # extra memory needed by building interim index
-      buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
-    knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
-  loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments
-  enableDisk: false # enable querynode load disk index, and search on disk index
-  maxDiskUsagePercentage: 95
-  cache:
-    enabled: true
-    memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024
-    readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed`
-    # options: async, sync, disable.
-    # Specifies the necessity for warming up the chunk cache.
-    # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the
-    # chunk cache during the load process. This approach has the potential to substantially reduce query/search latency
-    # for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage;
-    # 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query.
-    warmup: disable
-  mmap:
-    mmapEnabled: false # Enable mmap for loading data
-  lazyload:
-    enabled: false # Enable lazyload for loading data
-    waitTimeout: 30000 # max wait timeout duration in milliseconds before start to do lazyload search and retrieve
-    requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default
-    requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default
-    maxRetryTimes: 1 # max retry times for lazy load, 1 by default
-    maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default
-  grouping:
-    enabled: true
-    maxNQ: 1000
-    topKMergeRatio: 20
-  scheduler:
-    receiveChanSize: 10240
-    unsolvedQueueSize: 10240
-    # maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task).
-    # Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio.
-    # It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2.
-    # Max read concurrency must be greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100.
-    # (0, 100]
-    maxReadConcurrentRatio: 1
-    cpuRatio: 10 # ratio used to estimate read task cpu usage.
-    maxTimestampLag: 86400
-    scheduleReadPolicy:
-      # fifo: A FIFO queue support the schedule.
-      # user-task-polling:
-      # 	The user's tasks will be polled one by one and scheduled.
-      # 	Scheduling is fair on task granularity.
-      # 	The policy is based on the username for authentication.
-      # 	And an empty username is considered the same user.
-      # 	When there are no multi-users, the policy decays into FIFO
-      name: fifo
-      taskQueueExpire: 60 # Control how long (many seconds) that queue retains since queue is empty
-      enableCrossUserGrouping: false # Enable Cross user grouping when using user-task-polling policy. (Disable it if user's task can not merge each other)
-      maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler
-  dataSync:
-    flowGraph:
-      maxQueueLength: 16 # Maximum length of task queue in flowgraph
-      maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
-  enableSegmentPrune: false # use partition prune function on shard delegator
-  ip: # if not specified, use the first unicastable address
-  port: 21123
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-indexCoord:
-  bindIndexNodeMode:
-    enable: false
-    address: localhost:22930
-    withCred: false
-    nodeID: 0
-  segment:
-    minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed
-
-indexNode:
-  scheduler:
-    buildParallel: 1
-  enableDisk: true # enable index node build disk vector index
-  maxDiskUsagePercentage: 95
-  ip: # if not specified, use the first unicastable address
-  port: 21121
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-dataCoord:
-  channel:
-    watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler update watch progress will reset timeout timer.
-    balanceWithRpc: true # Whether to enable balance with RPC, default to use etcd watch
-    legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered as legacy nodes, which doesn't have rpc based watch(). This is only used during rolling upgrade where legacy nodes won't get new channels
-    balanceSilentDuration: 300 # The duration after which the channel manager start background channel balancing
-    balanceInterval: 360 # The interval with which the channel manager check dml channel balance status
-    checkInterval: 1 # The interval in seconds with which the channel manager advances channel states
-    notifyChannelOperationTimeout: 5 # Timeout notifying channel operations (in seconds).
-  segment:
-    maxSize: 1024 # Maximum size of a segment in MB
-    diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index
-    sealProportion: 0.12
-    assignmentExpiration: 2000 # The time of the assignment expiration in ms
-    allocLatestExpireAttempt: 200 # The time attempting to alloc latest lastExpire from rootCoord after restart
-    maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60
-    # If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than
-    # minSizeFromIdleToSealed, Milvus will automatically seal it.
-    # The max idle time of segment in seconds, 10*60.
-    maxIdleTime: 600
-    minSizeFromIdleToSealed: 16 # The min size in MB of a segment which can go from idle to sealed.
-    # The max number of binlog file for one segment, the segment will be sealed if
-    # the number of binlog file reaches to max value.
-    maxBinlogFileNumber: 32
-    smallProportion: 0.5 # The segment is considered as "small segment" when its # of rows is smaller than
-    # (smallProportion * segment max # of rows).
-    # A compaction will happen on small segments if the segment after compaction will have
-    compactableProportion: 0.85
-    # over (compactableProportion * segment max # of rows) rows.
-    # MUST BE GREATER THAN OR EQUAL TO <smallProportion>!!!
-    # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%.
-    expansionRate: 1.25
-  autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version
-  enableCompaction: true # Enable data segment compaction
-  compaction:
-    enableAutoCompaction: true
-    indexBasedCompaction: true
-    rpcTimeout: 10
-    maxParallelTaskNum: 10
-    workerMaxParallelTaskNum: 2
-    levelzero:
-      forceTrigger:
-        minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB
-        maxSize: 67108864 # The maximum size in bytes to force trigger a LevelZero Compaction, default as 64MB
-        deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction
-        deltalogMaxNum: 30 # The maximum number of deltalog files to force trigger a LevelZero Compaction, default as 30
-  enableGarbageCollection: true
-  gc:
-    interval: 3600 # gc interval in seconds
-    missingTolerance: 86400 # file meta missing tolerance duration in seconds, default to 24hr(1d)
-    dropTolerance: 10800 # file belongs to dropped entity tolerance duration in seconds. 3600
-    removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects
-    scanInterval: 168 # garbage collection scan residue interval in hours
-  enableActiveStandby: false
-  brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout
-  autoBalance: true # Enable auto balance
-  checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
-  import:
-    filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task.
-    taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state.
-    maxSizeInMBPerImportTask: 6144 # To prevent generating of small segments, we will re-group imported files. This parameter represents the sum of file sizes in each group (each ImportTask).
-    scheduleInterval: 2 # The interval for scheduling import, measured in seconds.
-    checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker.
-    checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker.
-    maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request.
-    waitForIndex: true # Indicates whether the import operation waits for the completion of index building.
-  gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  ip: # if not specified, use the first unicastable address
-  port: 13333
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-dataNode:
-  dataSync:
-    flowGraph:
-      maxQueueLength: 16 # Maximum length of task queue in flowgraph
-      maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
-    maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally
-    skipMode:
-      enable: true # Support skip some timetick message to reduce CPU usage
-      skipNum: 4 # Consume one for every n records skipped
-      coldTime: 60 # Turn on skip mode after there are only timetick msg for x seconds
-  segment:
-    insertBufSize: 16777216 # Max buffer size to flush for a single segment.
-    deleteBufBytes: 16777216 # Max buffer size in bytes to flush del for a single channel, default as 16MB
-    syncPeriod: 600 # The period to sync segments if buffer is not empty.
-  memory:
-    forceSyncEnable: true # Set true to force sync if memory usage is too high
-    forceSyncSegmentNum: 1 # number of segments to sync, segments with top largest buffer will be synced.
-    checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds
-    forceSyncWatermark: 0.5 # memory watermark for standalone, upon reaching this watermark, segments will be synced.
-  timetick:
-    byRPC: true
-    interval: 500
-  channel:
-    # specify the size of global work pool of all channels
-    # if this parameter <= 0, will set it as the maximum number of CPUs that can be executing
-    # suggest to set it bigger on large collection numbers to avoid blocking
-    workPoolSize: -1
-    # specify the size of global work pool for channel checkpoint updating
-    # if this parameter <= 0, will set it as 10
-    updateChannelCheckpointMaxParallel: 10
-    updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel
-    updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call
-    maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC.
-    channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates.
-  import:
-    maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode.
-    maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files.
-    readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import.
-  compaction:
-    levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode
-  gracefulStopTimeout: 1800 # seconds. force stop node without graceful stop
-  ip: # if not specified, use the first unicastable address
-  port: 21124
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-# Configures the system log output.
-log:
-  level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
-  file:
-    rootPath: # root dir path to put logs, default "" means no log file will print. please adjust in embedded Milvus: /tmp/milvus/logs
-    maxSize: 300 # MB
-    maxAge: 10 # Maximum time for log retention in day.
-    maxBackups: 20
-  format: text # text or json
-  stdout: true # Stdout enable or not
-
-grpc:
-  log:
-    level: WARNING
-  gracefulStopTimeout: 10 # seconds, time to wait for graceful stop to finish
-  client:
-    compressionEnabled: false
-    dialTimeout: 200
-    keepAliveTime: 10000
-    keepAliveTimeout: 20000
-    maxMaxAttempts: 10
-    initialBackoff: 0.2
-    maxBackoff: 10
-    minResetInterval: 1000
-    maxCancelError: 32
-    minSessionCheckInterval: 200
-
-# Configure the proxy tls enable.
-tls:
-  serverPemPath: configs/cert/server.pem
-  serverKeyPath: configs/cert/server.key
-  caPemPath: configs/cert/ca.pem
-
-common:
-  defaultPartitionName: _default # default partition name for a collection
-  defaultIndexName: _default_idx # default index name
-  entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire
-  indexSliceSize: 16 # MB
-  threadCoreCoefficient:
-    highPriority: 10 # This parameter specifies how many times the number of threads is the number of cores in high priority pool
-    middlePriority: 5 # This parameter specifies how many times the number of threads is the number of cores in middle priority pool
-    lowPriority: 1 # This parameter specifies how many times the number of threads is the number of cores in low priority pool
-  buildIndexThreadPoolRatio: 0.75
-  DiskIndex:
-    MaxDegree: 56
-    SearchListSize: 100
-    PQCodeBudgetGBRatio: 0.125
-    BuildNumThreadsRatio: 1
-    SearchCacheBudgetGBRatio: 0.1
-    LoadNumThreadRatio: 8
-    BeamWidthRatio: 4
-  gracefulTime: 5000 # milliseconds. it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency.
-  gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time.
-  storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead
-  # Default value: auto
-  # Valid values: [auto, avx512, avx2, avx, sse4_2]
-  # This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building.
-  simdType: auto
-  security:
-    authorizationEnabled: false
-    # The superusers will ignore some system check processes,
-    # like the old password verification when updating the credential
-    superUsers:
-    tlsMode: 0
-  session:
-    ttl: 30 # ttl value when session granting a lease to register service
-    retryTimes: 30 # retry times when session sending etcd requests
-  locks:
-    metrics:
-      enable: false # whether gather statistics for metrics locks
-    threshold:
-      info: 500 # minimum milliseconds for printing durations in info level
-      warn: 1000 # minimum milliseconds for printing durations in warn level
-  storage:
-    scheme: s3
-    enablev2: false
-  ttMsgEnabled: true # Whether the instance enables sending ts messages
-  traceLogMode: 0 # trace request info
-  bloomFilterSize: 100000 # bloom filter initial size
-  maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter
-
-# QuotaConfig, configurations of Milvus quota and limits.
-# By default, we enable:
-#   1. TT protection;
-#   2. Memory protection.
-#   3. Disk quota protection.
-# You can enable:
-#   1. DML throughput limitation;
-#   2. DDL, DQL qps/rps limitation;
-#   3. DQL Queue length/latency protection;
-#   4. DQL result rate protection;
-# If necessary, you can also manually force to deny RW requests.
-quotaAndLimits:
-  enabled: true # `true` to enable quota and limits, `false` to disable.
-  # quotaCenterCollectInterval is the time interval that quotaCenter
-  # collects metrics from Proxies, Query cluster and Data cluster.
-  # seconds, (0 ~ 65536)
-  quotaCenterCollectInterval: 3
-  ddl:
-    enabled: false
-    collectionRate: -1 # qps, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
-    partitionRate: -1 # qps, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
-    db:
-      collectionRate: -1 # qps of db level , default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
-      partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
-  indexRate:
-    enabled: false
-    max: -1 # qps, default no limit, rate for CreateIndex, DropIndex
-    db:
-      max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex
-  flushRate:
-    enabled: true
-    max: -1 # qps, default no limit, rate for flush
-    collection:
-      max: 0.1 # qps, default no limit, rate for flush at collection level.
-    db:
-      max: -1 # qps of db level, default no limit, rate for flush
-  compactionRate:
-    enabled: false
-    max: -1 # qps, default no limit, rate for manualCompaction
-    db:
-      max: -1 # qps of db level, default no limit, rate for manualCompaction
-  dml:
-    # dml limit rates, default no limit.
-    # The maximum rate will not be greater than max.
-    enabled: false
-    insertRate:
-      max: -1 # MB/s, default no limit
-      db:
-        max: -1 # MB/s, default no limit
-      collection:
-        max: -1 # MB/s, default no limit
-      partition:
-        max: -1 # MB/s, default no limit
-    upsertRate:
-      max: -1 # MB/s, default no limit
-      db:
-        max: -1 # MB/s, default no limit
-      collection:
-        max: -1 # MB/s, default no limit
-      partition:
-        max: -1 # MB/s, default no limit
-    deleteRate:
-      max: -1 # MB/s, default no limit
-      db:
-        max: -1 # MB/s, default no limit
-      collection:
-        max: -1 # MB/s, default no limit
-      partition:
-        max: -1 # MB/s, default no limit
-    bulkLoadRate:
-      max: -1 # MB/s, default no limit, not support yet. TODO: limit bulkLoad rate
-      db:
-        max: -1 # MB/s, default no limit, not support yet. TODO: limit db bulkLoad rate
-      collection:
-        max: -1 # MB/s, default no limit, not support yet. TODO: limit collection bulkLoad rate
-      partition:
-        max: -1 # MB/s, default no limit, not support yet. TODO: limit partition bulkLoad rate
-  dql:
-    # dql limit rates, default no limit.
-    # The maximum rate will not be greater than max.
-    enabled: false
-    searchRate:
-      max: -1 # vps (vectors per second), default no limit
-      db:
-        max: -1 # vps (vectors per second), default no limit
-      collection:
-        max: -1 # vps (vectors per second), default no limit
-      partition:
-        max: -1 # vps (vectors per second), default no limit
-    queryRate:
-      max: -1 # qps, default no limit
-      db:
-        max: -1 # qps, default no limit
-      collection:
-        max: -1 # qps, default no limit
-      partition:
-        max: -1 # qps, default no limit
-  limits:
-    maxCollectionNum: 65536
-    maxCollectionNumPerDB: 65536
-    maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit
-    maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes
-  limitWriting:
-    # forceDeny false means dml requests are allowed (except for some
-    # specific conditions, such as memory of nodes to water marker), true means always reject all dml requests.
-    forceDeny: false
-    ttProtection:
-      enabled: false
-      # maxTimeTickDelay indicates the backpressure for DML Operations.
-      # DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay,
-      # if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected.
-      # seconds
-      maxTimeTickDelay: 300
-    memProtection:
-      # When memory usage > memoryHighWaterLevel, all dml requests would be rejected;
-      # When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate;
-      # When memory usage < memoryLowWaterLevel, no action.
-      enabled: true
-      dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes
-      dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes
-      queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes
-      queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes
-    growingSegmentsSizeProtection:
-      # No action will be taken if the growing segments size is less than the low watermark.
-      # When the growing segments size exceeds the low watermark, the dml rate will be reduced,
-      # but the rate will not be lower than minRateRatio * dmlRate.
-      enabled: false
-      minRateRatio: 0.5
-      lowWaterLevel: 0.2
-      highWaterLevel: 0.4
-    diskProtection:
-      enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected;
-      diskQuota: -1 # MB, (0, +inf), default no limit
-      diskQuotaPerDB: -1 # MB, (0, +inf), default no limit
-      diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit
-      diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit
-  limitReading:
-    # forceDeny false means dql requests are allowed (except for some
-    # specific conditions, such as collection has been dropped), true means always reject all dql requests.
-    forceDeny: false
-    queueProtection:
-      enabled: false
-      # nqInQueueThreshold indicated that the system was under backpressure for Search/Query path.
-      # If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off
-      # until the NQ in queue no longer exceeds nqInQueueThreshold. We think of the NQ of query request as 1.
-      # int, default no limit
-      nqInQueueThreshold: -1
-      # queueLatencyThreshold indicated that the system was under backpressure for Search/Query path.
-      # If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off
-      # until the latency of queuing no longer exceeds queueLatencyThreshold.
-      # The latency here refers to the averaged latency over a period of time.
-      # milliseconds, default no limit
-      queueLatencyThreshold: -1
-    resultProtection:
-      enabled: false
-      # maxReadResultRate indicated that the system was under backpressure for Search/Query path.
-      # If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off
-      # until the read result rate no longer exceeds maxReadResultRate.
-      # MB/s, default no limit
-      maxReadResultRate: -1
-      maxReadResultRatePerDB: -1
-      maxReadResultRatePerCollection: -1
-    # coolOffSpeed is the speed of search&query rates cool off.
-    # (0, 1]
-    coolOffSpeed: 0.9
-
-trace:
-  # trace exporter type, default is stdout,
-  # optional values: ['noop','stdout', 'jaeger', 'otlp']
-  exporter: noop
-  # fraction of traceID based sampler,
-  # optional values: [0, 1]
-  # Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
-  sampleFraction: 0
-  jaeger:
-    url: # when exporter is jaeger should set the jaeger's URL
-  otlp:
-    endpoint: # example: "127.0.0.1:4318"
-    secure: true
-
-#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation.
-#here, you can set the size of the memory occupied by the memory pool, with the unit being MB.
-#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize.
-#if initMemSize and maxMemSize are both set to zero,
-#milvus will automatically initialize half of the available GPU memory,
-#and maxMemSize will be the whole available GPU memory.
-gpu:
-  initMemSize: # Gpu Memory Pool init size
-  maxMemSize: # Gpu Memory Pool Max size
--- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/README.md
@@ -48,11 +48,14 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090"
 export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
 export REDIS_URL="redis://${host_ip}:6379"
 export INDEX_NAME="rag-redis"
+export MEGA_SERVICE_HOST_IP=${host_ip}
 export EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export RERANK_SERVICE_HOST_IP=${host_ip}
+export LLM_SERVICE_HOST_IP=${host_ip}
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8000/v1/retrievaltool"
 export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
 export llm_hardware='cpu/xeon' #cpu/xeon, xpu, hpu/gaudi
@@ -60,14 +63,6 @@ cd GenAIExamples/DocIndexRetriever/intel/hpu/gaudi/
 docker compose up -d
 ```

-To run the DocRetriever with Rerank pipeline using the Milvus vector database, use the compose_milvus.yaml configuration file and set the MILVUS_HOST environment variable.
-
-```bash
-export MILVUS_HOST=${host_ip}
-cd GenAIExamples/DocIndexRetriever/docker_compose/intel/hpu/gaudi
-docker compose -f compose_milvus.yaml up -d
-```
-
 ## 4. Validation

 Add Knowledge Base via HTTP Links:
--- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -89,7 +89,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${RERANK_MODEL_ID} --auto-truncate"
    container_name: tei-reranking-gaudi-server
    ports:
@@ -145,9 +145,11 @@ services:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
+      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP}
      - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP}
      - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP}
+      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
      - LOGFLAG=${LOGFLAG}
    ipc: host
    restart: always
--- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose_milvus.yaml
+++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose_milvus.yaml
@@ -1,229 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-version: "3.8"
-
-services:
-  milvus-etcd:
-    container_name: milvus-etcd
-    image: quay.io/coreos/etcd:v3.5.5
-    environment:
-      - ETCD_AUTO_COMPACTION_MODE=revision
-      - ETCD_AUTO_COMPACTION_RETENTION=1000
-      - ETCD_QUOTA_BACKEND_BYTES=4294967296
-      - ETCD_SNAPSHOT_COUNT=50000
-    volumes:
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
-    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
-    healthcheck:
-      test: ["CMD", "etcdctl", "endpoint", "health"]
-      interval: 30s
-      timeout: 20s
-      retries: 3
-  milvus-minio:
-    container_name: milvus-minio
-    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
-    environment:
-      MINIO_ACCESS_KEY: minioadmin
-      MINIO_SECRET_KEY: minioadmin
-    ports:
-      - "${MINIO_PORT1:-5044}:9001"
-      - "${MINIO_PORT2:-5043}:9000"
-    volumes:
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
-    command: minio server /minio_data --console-address ":9001"
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
-      interval: 30s
-      timeout: 20s
-      retries: 3
-
-  milvus-standalone:
-    container_name: milvus-standalone
-    image: milvusdb/milvus:v2.4.6
-    command: ["milvus", "run", "standalone"]
-    security_opt:
-      - seccomp:unconfined
-    environment:
-      ETCD_ENDPOINTS: milvus-etcd:2379
-      MINIO_ADDRESS: milvus-minio:9000
-    volumes:
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/config/milvus.yaml:/milvus/configs/milvus.yaml
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
-      interval: 30s
-      start_period: 90s
-      timeout: 20s
-      retries: 3
-    ports:
-      - "19530:19530"
-      - "${MILVUS_STANDALONE_PORT:-9091}:9091"
-    depends_on:
-      - "milvus-etcd"
-      - "milvus-minio"
-
-  dataprep-milvus:
-    image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
-    container_name: dataprep-milvus-server
-    ports:
-      - "${DATAPREP_PORT:-6007}:5000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MILVUS"
-      MILVUS_HOST: ${MILVUS_HOST}
-      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      LOGFLAG: ${LOGFLAG}
-    restart: unless-stopped
-    depends_on:
-      tei-embedding-service:
-        condition: service_healthy
-      milvus-standalone:
-        condition: service_healthy
-      milvus-etcd:
-        condition: service_healthy
-      milvus-minio:
-        condition: service_healthy
-
-  tei-embedding-service:
-    image: ghcr.io/huggingface/tei-gaudi:1.5.0
-    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
-    container_name: tei-embedding-gaudi-server
-    ports:
-      - "8090:80"
-    volumes:
-      - "./data:/data"
-    runtime: habana
-    cap_add:
-      - SYS_NICE
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HABANA_VISIBLE_DEVICES: ${tei_embedding_devices:-all}
-      OMPI_MCA_btl_vader_single_copy_mechanism: none
-      MAX_WARMUP_SEQUENCE_LENGTH: 512
-      INIT_HCCL_ON_ACQUIRE: 0
-      ENABLE_EXPERIMENTAL_FLAGS: true
-      host_ip: ${host_ip}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://$host_ip:8090/health || exit 1"]
-      interval: 10s
-      timeout: 10s
-      retries: 60
-  embedding:
-    image: ${REGISTRY:-opea}/embedding:${TAG:-latest}
-    container_name: embedding-server
-    # volumes:
-    #   - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/comps
-    ports:
-      - "6000:6000"
-    ipc: host
-    depends_on:
-      tei-embedding-service:
-        condition: service_healthy
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
-      LOGFLAG: ${LOGFLAG}
-    restart: unless-stopped
-
-  retriever:
-    image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
-    container_name: retriever-milvus-server
-    depends_on:
-      - milvus-standalone
-    ports:
-      - "7000:7000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      MILVUS_HOST: ${host_ip}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
-      LOGFLAG: ${LOGFLAG}
-      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_MILVUS"
-    restart: unless-stopped
-
-  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${RERANK_MODEL_ID} --auto-truncate"
-    container_name: tei-reranking-gaudi-server
-    ports:
-      - "8808:80"
-    volumes:
-      - "./data:/data"
-    shm_size: 1g
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      host_ip: ${host_ip}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://$host_ip:8808/health || exit 1"]
-      interval: 10s
-      timeout: 10s
-      retries: 60
-
-  reranking:
-    image: ${REGISTRY:-opea}/reranking:${TAG:-latest}
-    container_name: reranking-tei-gaudi-server
-    # volumes:
-    #   - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/user/comps
-    depends_on:
-      tei-reranking-service:
-        condition: service_healthy
-    ports:
-      - "8000:8000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      RERANK_TYPE: ${RERANK_TYPE}
-      TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-      LOGFLAG: ${LOGFLAG}
-    restart: unless-stopped
-
-  doc-index-retriever-server:
-    image: ${REGISTRY:-opea}/doc-index-retriever:${TAG:-latest}
-    container_name: doc-index-retriever-server
-    depends_on:
-      - milvus-standalone
-      - tei-embedding-service
-      - embedding
-      - retriever
-      - reranking
-    ports:
-      - "8889:8889"
-    environment:
-      - no_proxy=${no_proxy}
-      - https_proxy=${https_proxy}
-      - http_proxy=${http_proxy}
-      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
-      - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP}
-      - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP}
-      - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP}
-      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
-    ipc: host
-    restart: always
-
-networks:
-  default:
-    driver: bridge
--- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/config/milvus.yaml
+++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/config/milvus.yaml
@@ -1,811 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# Licensed to the LF AI & Data foundation under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Related configuration of etcd, used to store Milvus metadata & service discovery.
-etcd:
-  endpoints: localhost:2379
-  rootPath: by-dev # The root path where data is stored in etcd
-  metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
-  kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
-  log:
-    level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
-    # path is one of:
-    #  - "default" as os.Stderr,
-    #  - "stderr" as os.Stderr,
-    #  - "stdout" as os.Stdout,
-    #  - file path to append server logs to.
-    # please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log
-    path: stdout
-  ssl:
-    enabled: false # Whether to support ETCD secure connection mode
-    tlsCert: /path/to/etcd-client.pem # path to your cert file
-    tlsKey: /path/to/etcd-client-key.pem # path to your key file
-    tlsCACert: /path/to/ca.pem # path to your CACert file
-    # TLS min version
-    # Optional values: 1.0, 1.1, 1.2, 1.3。
-    # We recommend using version 1.2 and above.
-    tlsMinVersion: 1.3
-  requestTimeout: 10000 # Etcd operation timeout in milliseconds
-  use:
-    embed: false # Whether to enable embedded Etcd (an in-process EtcdServer).
-  data:
-    dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/
-  auth:
-    enabled: false # Whether to enable authentication
-    userName: # username for etcd authentication
-    password: # password for etcd authentication
-
-metastore:
-  type: etcd # Default value: etcd, Valid values: [etcd, tikv]
-
-# Related configuration of tikv, used to store Milvus metadata.
-# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery.
-# TiKV is a good option when the metadata size requires better horizontal scalability.
-tikv:
-  endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd.
-  rootPath: by-dev # The root path where data is stored in tikv
-  metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
-  kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
-  requestTimeout: 10000 # ms, tikv request timeout
-  snapshotScanSize: 256 # batch size of tikv snapshot scan
-  ssl:
-    enabled: false # Whether to support TiKV secure connection mode
-    tlsCert: # path to your cert file
-    tlsKey: # path to your key file
-    tlsCACert: # path to your CACert file
-
-localStorage:
-  path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/
-
-# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus.
-# We refer to the storage service as MinIO/S3 in the following description for simplicity.
-minio:
-  address: localhost # Address of MinIO/S3
-  port: 9000 # Port of MinIO/S3
-  accessKeyID: minioadmin # accessKeyID of MinIO/S3
-  secretAccessKey: minioadmin # MinIO/S3 encryption string
-  useSSL: false # Access to MinIO/S3 with SSL
-  ssl:
-    tlsCACert: /path/to/public.crt # path to your CACert file
-  bucketName: a-bucket # Bucket name in MinIO/S3
-  rootPath: files # The root path where the message is stored in MinIO/S3
-  # Whether to useIAM role to access S3/GCS instead of access/secret keys
-  # For more information, refer to
-  # aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html
-  # gcp: https://cloud.google.com/storage/docs/access-control/iam
-  # aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control
-  # aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role
-  useIAM: false
-  # Cloud Provider of S3. Supports: "aws", "gcp", "aliyun".
-  # You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio
-  # You can use "gcp" for other cloud provider supports S3 API with signature v2
-  # You can use "aliyun" for other cloud provider uses virtual host style bucket
-  # When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now
-  cloudProvider: aws
-  # Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws".
-  # Leave it empty if you want to use AWS default endpoint
-  iamEndpoint:
-  logLevel: fatal # Log level for aws sdk log. Supported level:  off, fatal, error, warn, info, debug, trace
-  region: # Specify minio storage system location region
-  useVirtualHost: false # Whether use virtual host mode for bucket
-  requestTimeoutMs: 10000 # minio timeout for request time in milliseconds
-  # The maximum number of objects requested per batch in minio ListObjects rpc,
-  # 0 means using oss client by default, decrease these configuration if ListObjects timeout
-  listObjectsMaxKeys: 0
-
-# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka.
-# You can change your mq by setting mq.type field.
-# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file.
-# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka
-# 2. cluster mode:  Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode)
-mq:
-  # Default value: "default"
-  # Valid values: [default, pulsar, kafka, rocksmq, natsmq]
-  type: default
-  enablePursuitMode: true # Default value: "true"
-  pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds
-  pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes
-  mqBufSize: 16 # MQ client consumer buffer length
-  dispatcher:
-    mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge
-    targetBufSize: 16 # the length of channel buffer for targe
-    maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack
-
-# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services.
-pulsar:
-  address: localhost # Address of pulsar
-  port: 6650 # Port of Pulsar
-  webport: 80 # Web port of pulsar, if you connect directly without proxy, should use 8080
-  maxMessageSize: 5242880 # 5 * 1024 * 1024 Bytes, Maximum size of each message in pulsar.
-  tenant: public
-  namespace: default
-  requestTimeout: 60 # pulsar client global request timeout in seconds
-  enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path.
-
-# If you want to enable kafka, needs to comment the pulsar configs
-# kafka:
-#   brokerList:
-#   saslUsername:
-#   saslPassword:
-#   saslMechanisms:
-#   securityProtocol:
-#   ssl:
-#     enabled: false # whether to enable ssl mode
-#     tlsCert:  # path to client's public key (PEM) used for authentication
-#     tlsKey:  # path to client's private key (PEM) used for authentication
-#     tlsCaCert:  # file or directory path to CA certificate(s) for verifying the broker's key
-#     tlsKeyPassword:  # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any
-#   readTimeout: 10
-
-rocksmq:
-  # The path where the message is stored in rocksmq
-  # please adjust in embedded Milvus: /tmp/milvus/rdb_data
-  path: /var/lib/milvus/rdb_data
-  lrucacheratio: 0.06 # rocksdb cache memory ratio
-  rocksmqPageSize: 67108864 # 64 MB, 64 * 1024 * 1024 bytes, The size of each page of messages in rocksmq
-  retentionTimeInMinutes: 4320 # 3 days, 3 * 24 * 60 minutes, The retention time of the message in rocksmq.
-  retentionSizeInMB: 8192 # 8 GB, 8 * 1024 MB, The retention size of the message in rocksmq.
-  compactionInterval: 86400 # 1 day, trigger rocksdb compaction every day to remove deleted data
-  compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level.
-
-# natsmq configuration.
-# more detail: https://docs.nats.io/running-a-nats-service/configuration
-natsmq:
-  server:
-    port: 4222 # Port for nats server listening
-    storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats
-    maxFileStore: 17179869184 # Maximum size of the 'file' storage
-    maxPayload: 8388608 # Maximum number of bytes in a message payload
-    maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections
-    initializeTimeout: 4000 # waiting for initialization of natsmq finished
-    monitor:
-      trace: false # If true enable protocol trace log messages
-      debug: false # If true enable debug log messages
-      logTime: true # If set to false, log without timestamps.
-      logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. of milvus binary if use relative path
-      logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one
-    retention:
-      maxAge: 4320 # Maximum age of any message in the P-channel
-      maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
-      maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
-
-# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests
-rootCoord:
-  dmlChannelNum: 16 # The number of dml channels created at system startup
-  maxPartitionNum: 1024 # Maximum number of partitions in a collection
-  minSegmentSizeToEnableIndex: 1024 # It's a threshold. When the segment size is less than this value, the segment will not be indexed
-  enableActiveStandby: false
-  maxDatabaseNum: 64 # Maximum number of database
-  maxGeneralCapacity: 65536 # upper limit for the sum of of product of partitionNumber and shardNumber
-  gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  ip: # if not specified, use the first unicastable address
-  port: 53100
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-# Related configuration of proxy, used to validate client requests and reduce the returned results.
-proxy:
-  timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
-  healthCheckTimeout: 3000 # ms, the interval that to do component healthy check
-  msgStream:
-    timeTick:
-      bufSize: 512
-  maxNameLength: 255 # Maximum length of name for a collection or alias
-  # Maximum number of fields in a collection.
-  # As of today (2.2.0 and after) it is strongly DISCOURAGED to set maxFieldNum >= 64.
-  # So adjust at your risk!
-  maxFieldNum: 64
-  maxVectorFieldNum: 4 # Maximum number of vector fields in a collection.
-  maxShardNum: 16 # Maximum number of shards in a collection
-  maxDimension: 32768 # Maximum dimension of a vector
-  # Whether to produce gin logs.\n
-  # please adjust in embedded Milvus: false
-  ginLogging: true
-  ginLogSkipPaths: / # skip url path for gin log
-  maxTaskNum: 1024 # max task number of proxy task queue
-  mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection
-  accessLog:
-    enable: false # if use access log
-    minioEnable: false # if upload sealed access log file to minio
-    localPath: /tmp/milvus_access
-    filename: # Log filename, leave empty to use stdout.
-    maxSize: 64 # Max size for a single file, in MB.
-    cacheSize: 10240 # Size of log of memory cache, in B
-    rotatedTime: 0 # Max time for single access log file in seconds
-    remotePath: access_log/ # File path in minIO
-    remoteMaxTime: 0 # Max time for log file in minIO, in hours
-    formatters:
-      base:
-        format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]"
-      query:
-        format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]"
-        methods: "Query,Search,Delete"
-  connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info
-  connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds
-  maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos
-  gracefulStopTimeout: 30 # seconds. force stop node without graceful stop
-  slowQuerySpanInSeconds: 5 # query whose executed time exceeds the `slowQuerySpanInSeconds` can be considered slow, in seconds.
-  http:
-    enabled: true # Whether to enable the http server
-    debug_mode: false # Whether to enable http server debug mode
-    port: # high-level restful api
-    acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64
-    enablePprof: true # Whether to enable pprof middleware on the metrics port
-  ip: # if not specified, use the first unicastable address
-  port: 19530
-  internalPort: 19529
-  grpc:
-    serverMaxSendSize: 268435456
-    serverMaxRecvSize: 67108864
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 67108864
-
-# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments.
-queryCoord:
-  taskMergeCap: 1
-  taskExecutionCap: 256
-  autoHandoff: true # Enable auto handoff
-  autoBalance: true # Enable auto balance
-  autoBalanceChannel: true # Enable auto balance channel
-  balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes
-  globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes
-  scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance
-  reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance
-  overloadedMemoryThresholdPercentage: 90 # The threshold percentage that memory overload
-  balanceIntervalSeconds: 60
-  memoryUsageMaxDifferencePercentage: 30
-  rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes
-  segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes
-  globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes
-  segmentCountMaxSteps: 50 # segment count based plan generator max steps
-  rowCountMaxSteps: 50 # segment count based plan generator max steps
-  randomMaxSteps: 10 # segment count based plan generator max steps
-  growingRowCountWeight: 4 # the memory weight of growing segment row count
-  balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed
-  checkSegmentInterval: 1000
-  checkChannelInterval: 1000
-  checkBalanceInterval: 10000
-  checkIndexInterval: 10000
-  channelTaskTimeout: 60000 # 1 minute
-  segmentTaskTimeout: 120000 # 2 minute
-  distPullInterval: 500
-  collectionObserverInterval: 200
-  checkExecutedFlagInterval: 100
-  heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available
-  loadTimeoutSeconds: 600
-  distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds
-  heatbeatWarningLag: 5000 # the lag value for querycoord report warning when last heartbeat is too old, in milliseconds
-  checkHandoffInterval: 5000
-  enableActiveStandby: false
-  checkInterval: 1000
-  checkHealthInterval: 3000 # 3s, the interval when query coord try to check health of query node
-  checkHealthRPCTimeout: 2000 # 100ms, the timeout of check health rpc to query node
-  brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout
-  collectionRecoverTimes: 3 # if collection recover times reach the limit during loading state, release it
-  observerTaskParallel: 16 # the parallel observer dispatcher task number
-  checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
-  checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session
-  gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  enableStoppingBalance: true # whether enable stopping balance
-  channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode
-  cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds
-  ip: # if not specified, use the first unicastable address
-  port: 19531
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-# Related configuration of queryNode, used to run hybrid search between vector and scalar data.
-queryNode:
-  stats:
-    publishInterval: 1000 # Interval for querynode to report node information (milliseconds)
-  segcore:
-    knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]).
-    chunkRows: 128 # The number of vectors in a chunk.
-    interimIndex:
-      enableIndex: true # Enable segment build with index to accelerate vector search when segment is in growing or binlog.
-      nlist: 128 # temp index nlist, recommend to set sqrt(chunkRows), must smaller than chunkRows/8
-      nprobe: 16 # nprobe to search small index, based on your accuracy requirement, must smaller than nlist
-      memExpansionRate: 1.15 # extra memory needed by building interim index
-      buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
-    knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
-  loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments
-  enableDisk: false # enable querynode load disk index, and search on disk index
-  maxDiskUsagePercentage: 95
-  cache:
-    enabled: true
-    memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024
-    readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed`
-    # options: async, sync, disable.
-    # Specifies the necessity for warming up the chunk cache.
-    # 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the
-    # chunk cache during the load process. This approach has the potential to substantially reduce query/search latency
-    # for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage;
-    # 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query.
-    warmup: disable
-  mmap:
-    mmapEnabled: false # Enable mmap for loading data
-  lazyload:
-    enabled: false # Enable lazyload for loading data
-    waitTimeout: 30000 # max wait timeout duration in milliseconds before start to do lazyload search and retrieve
-    requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default
-    requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default
-    maxRetryTimes: 1 # max retry times for lazy load, 1 by default
-    maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default
-  grouping:
-    enabled: true
-    maxNQ: 1000
-    topKMergeRatio: 20
-  scheduler:
-    receiveChanSize: 10240
-    unsolvedQueueSize: 10240
-    # maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task).
-    # Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio.
-    # It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2.
-    # Max read concurrency must greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100.
-    # (0, 100]
-    maxReadConcurrentRatio: 1
-    cpuRatio: 10 # ratio used to estimate read task cpu usage.
-    maxTimestampLag: 86400
-    scheduleReadPolicy:
-      # fifo: A FIFO queue support the schedule.
-      # user-task-polling:
-      # 	The user's tasks will be polled one by one and scheduled.
-      # 	Scheduling is fair on task granularity.
-      # 	The policy is based on the username for authentication.
-      # 	And an empty username is considered the same user.
-      # 	When there are no multi-users, the policy decay into FIFO"
-      name: fifo
-      taskQueueExpire: 60 # Control how long (many seconds) that queue retains since queue is empty
-      enableCrossUserGrouping: false # Enable Cross user grouping when using user-task-polling policy. (Disable it if user's task can not merge each other)
-      maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler
-  dataSync:
-    flowGraph:
-      maxQueueLength: 16 # Maximum length of task queue in flowgraph
-      maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
-  enableSegmentPrune: false # use partition prune function on shard delegator
-  ip: # if not specified, use the first unicastable address
-  port: 21123
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-indexCoord:
-  bindIndexNodeMode:
-    enable: false
-    address: localhost:22930
-    withCred: false
-    nodeID: 0
-  segment:
-    minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed
-
-indexNode:
-  scheduler:
-    buildParallel: 1
-  enableDisk: true # enable index node build disk vector index
-  maxDiskUsagePercentage: 95
-  ip: # if not specified, use the first unicastable address
-  port: 21121
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-dataCoord:
-  channel:
-    watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler update watch progress will reset timeout timer.
-    balanceWithRpc: true # Whether to enable balance with RPC, default to use etcd watch
-    legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered as legacy nodes, which doesn't have rpc based watch(). This is only used during rolling upgrade where legacy nodes won't get new channels
-    balanceSilentDuration: 300 # The duration after which the channel manager start background channel balancing
-    balanceInterval: 360 # The interval with which the channel manager check dml channel balance status
-    checkInterval: 1 # The interval in seconds with which the channel manager advances channel states
-    notifyChannelOperationTimeout: 5 # Timeout notifing channel operations (in seconds).
-  segment:
-    maxSize: 1024 # Maximum size of a segment in MB
-    diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index
-    sealProportion: 0.12
-    assignmentExpiration: 2000 # The time of the assignment expiration in ms
-    allocLatestExpireAttempt: 200 # The time attempting to alloc latest lastExpire from rootCoord after restart
-    maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60
-    # If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than
-    # minSizeFromIdleToSealed, Milvus will automatically seal it.
-    # The max idle time of segment in seconds, 10*60.
-    maxIdleTime: 600
-    minSizeFromIdleToSealed: 16 # The min size in MB of segment which can be idle from sealed.
-    # The max number of binlog file for one segment, the segment will be sealed if
-    # the number of binlog file reaches to max value.
-    maxBinlogFileNumber: 32
-    smallProportion: 0.5 # The segment is considered as "small segment" when its # of rows is smaller than
-    # (smallProportion * segment max # of rows).
-    # A compaction will happen on small segments if the segment after compaction will have
-    compactableProportion: 0.85
-    # over (compactableProportion * segment max # of rows) rows.
-    # MUST BE GREATER THAN OR EQUAL TO <smallProportion>!!!
-    # During compaction, the size of segment # of rows is able to exceed segment max # of rows by (expansionRate-1) * 100%.
-    expansionRate: 1.25
-  autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version
-  enableCompaction: true # Enable data segment compaction
-  compaction:
-    enableAutoCompaction: true
-    indexBasedCompaction: true
-    rpcTimeout: 10
-    maxParallelTaskNum: 10
-    workerMaxParallelTaskNum: 2
-    levelzero:
-      forceTrigger:
-        minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB
-        maxSize: 67108864 # The maxmum size in bytes to force trigger a LevelZero Compaction, default as 64MB
-        deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction
-        deltalogMaxNum: 30 # The maxmum number of deltalog files to force trigger a LevelZero Compaction, default as 30
-  enableGarbageCollection: true
-  gc:
-    interval: 3600 # gc interval in seconds
-    missingTolerance: 86400 # file meta missing tolerance duration in seconds, default to 24hr(1d)
-    dropTolerance: 10800 # file belongs to dropped entity tolerance duration in seconds. 3600
-    removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects
-    scanInterval: 168 # garbage collection scan residue interval in hours
-  enableActiveStandby: false
-  brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout
-  autoBalance: true # Enable auto balance
-  checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
-  import:
-    filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task.
-    taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state.
-    maxSizeInMBPerImportTask: 6144 # To prevent generating of small segments, we will re-group imported files. This parameter represents the sum of file sizes in each group (each ImportTask).
-    scheduleInterval: 2 # The interval for scheduling import, measured in seconds.
-    checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker.
-    checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker.
-    maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request.
-    waitForIndex: true # Indicates whether the import operation waits for the completion of index building.
-  gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
-  ip: # if not specified, use the first unicastable address
-  port: 13333
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-dataNode:
-  dataSync:
-    flowGraph:
-      maxQueueLength: 16 # Maximum length of task queue in flowgraph
-      maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
-    maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally
-    skipMode:
-      enable: true # Support skip some timetick message to reduce CPU usage
-      skipNum: 4 # Consume one for every n records skipped
-      coldTime: 60 # Turn on skip mode after there are only timetick msg for x seconds
-  segment:
-    insertBufSize: 16777216 # Max buffer size to flush for a single segment.
-    deleteBufBytes: 16777216 # Max buffer size in bytes to flush del for a single channel, default as 16MB
-    syncPeriod: 600 # The period to sync segments if buffer is not empty.
-  memory:
-    forceSyncEnable: true # Set true to force sync if memory usage is too high
-    forceSyncSegmentNum: 1 # number of segments to sync, segments with top largest buffer will be synced.
-    checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds
-    forceSyncWatermark: 0.5 # memory watermark for standalone, upon reaching this watermark, segments will be synced.
-  timetick:
-    byRPC: true
-    interval: 500
-  channel:
-    # specify the size of global work pool of all channels
-    # if this parameter <= 0, will set it as the maximum number of CPUs that can be executing
-    # suggest to set it bigger on large collection numbers to avoid blocking
-    workPoolSize: -1
-    # specify the size of global work pool for channel checkpoint updating
-    # if this parameter <= 0, will set it as 10
-    updateChannelCheckpointMaxParallel: 10
-    updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel
-    updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call
-    maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC.
-    channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates.
-  import:
-    maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode.
-    maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files.
-    readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import.
-  compaction:
-    levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode
-  gracefulStopTimeout: 1800 # seconds. force stop node without graceful stop
-  ip: # if not specified, use the first unicastable address
-  port: 21124
-  grpc:
-    serverMaxSendSize: 536870912
-    serverMaxRecvSize: 268435456
-    clientMaxSendSize: 268435456
-    clientMaxRecvSize: 536870912
-
-# Configures the system log output.
-log:
-  level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
-  file:
-    rootPath: # root dir path to put logs, default "" means no log file will print. please adjust in embedded Milvus: /tmp/milvus/logs
-    maxSize: 300 # MB
-    maxAge: 10 # Maximum time for log retention in day.
-    maxBackups: 20
-  format: text # text or json
-  stdout: true # Stdout enable or not
-
-grpc:
-  log:
-    level: WARNING
-  gracefulStopTimeout: 10 # second, time to wait graceful stop finish
-  client:
-    compressionEnabled: false
-    dialTimeout: 200
-    keepAliveTime: 10000
-    keepAliveTimeout: 20000
-    maxMaxAttempts: 10
-    initialBackoff: 0.2
-    maxBackoff: 10
-    minResetInterval: 1000
-    maxCancelError: 32
-    minSessionCheckInterval: 200
-
-# Configure the proxy tls enable.
-tls:
-  serverPemPath: configs/cert/server.pem
-  serverKeyPath: configs/cert/server.key
-  caPemPath: configs/cert/ca.pem
-
-common:
-  defaultPartitionName: _default # default partition name for a collection
-  defaultIndexName: _default_idx # default index name
-  entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire
-  indexSliceSize: 16 # MB
-  threadCoreCoefficient:
-    highPriority: 10 # This parameter specify how many times the number of threads is the number of cores in high priority pool
-    middlePriority: 5 # This parameter specify how many times the number of threads is the number of cores in middle priority pool
-    lowPriority: 1 # This parameter specify how many times the number of threads is the number of cores in low priority pool
-  buildIndexThreadPoolRatio: 0.75
-  DiskIndex:
-    MaxDegree: 56
-    SearchListSize: 100
-    PQCodeBudgetGBRatio: 0.125
-    BuildNumThreadsRatio: 1
-    SearchCacheBudgetGBRatio: 0.1
-    LoadNumThreadRatio: 8
-    BeamWidthRatio: 4
-  gracefulTime: 5000 # milliseconds. it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency.
-  gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time.
-  storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead
-  # Default value: auto
-  # Valid values: [auto, avx512, avx2, avx, sse4_2]
-  # This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building.
-  simdType: auto
-  security:
-    authorizationEnabled: false
-    # The superusers will ignore some system check processes,
-    # like the old password verification when updating the credential
-    superUsers:
-    tlsMode: 0
-  session:
-    ttl: 30 # ttl value when session granting a lease to register service
-    retryTimes: 30 # retry times when session sending etcd requests
-  locks:
-    metrics:
-      enable: false # whether gather statistics for metrics locks
-    threshold:
-      info: 500 # minimum milliseconds for printing durations in info level
-      warn: 1000 # minimum milliseconds for printing durations in warn level
-  storage:
-    scheme: s3
-    enablev2: false
-  ttMsgEnabled: true # Whether the instance disable sending ts messages
-  traceLogMode: 0 # trace request info
-  bloomFilterSize: 100000 # bloom filter initial size
-  maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter
-
-# QuotaConfig, configurations of Milvus quota and limits.
-# By default, we enable:
-#   1. TT protection;
-#   2. Memory protection.
-#   3. Disk quota protection.
-# You can enable:
-#   1. DML throughput limitation;
-#   2. DDL, DQL qps/rps limitation;
-#   3. DQL Queue length/latency protection;
-#   4. DQL result rate protection;
-# If necessary, you can also manually force to deny RW requests.
-quotaAndLimits:
-  enabled: true # `true` to enable quota and limits, `false` to disable.
-  # quotaCenterCollectInterval is the time interval that quotaCenter
-  # collects metrics from Proxies, Query cluster and Data cluster.
-  # seconds, (0 ~ 65536)
-  quotaCenterCollectInterval: 3
-  ddl:
-    enabled: false
-    collectionRate: -1 # qps, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
-    partitionRate: -1 # qps, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
-    db:
-      collectionRate: -1 # qps of db level , default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
-      partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
-  indexRate:
-    enabled: false
-    max: -1 # qps, default no limit, rate for CreateIndex, DropIndex
-    db:
-      max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex
-  flushRate:
-    enabled: true
-    max: -1 # qps, default no limit, rate for flush
-    collection:
-      max: 0.1 # qps, default no limit, rate for flush at collection level.
-    db:
-      max: -1 # qps of db level, default no limit, rate for flush
-  compactionRate:
-    enabled: false
-    max: -1 # qps, default no limit, rate for manualCompaction
-    db:
-      max: -1 # qps of db level, default no limit, rate for manualCompaction
-  dml:
-    # dml limit rates, default no limit.
-    # The maximum rate will not be greater than max.
-    enabled: false
-    insertRate:
-      max: -1 # MB/s, default no limit
-      db:
-        max: -1 # MB/s, default no limit
-      collection:
-        max: -1 # MB/s, default no limit
-      partition:
-        max: -1 # MB/s, default no limit
-    upsertRate:
-      max: -1 # MB/s, default no limit
-      db:
-        max: -1 # MB/s, default no limit
-      collection:
-        max: -1 # MB/s, default no limit
-      partition:
-        max: -1 # MB/s, default no limit
-    deleteRate:
-      max: -1 # MB/s, default no limit
-      db:
-        max: -1 # MB/s, default no limit
-      collection:
-        max: -1 # MB/s, default no limit
-      partition:
-        max: -1 # MB/s, default no limit
-    bulkLoadRate:
-      max: -1 # MB/s, default no limit, not support yet. TODO: limit bulkLoad rate
-      db:
-        max: -1 # MB/s, default no limit, not support yet. TODO: limit db bulkLoad rate
-      collection:
-        max: -1 # MB/s, default no limit, not support yet. TODO: limit collection bulkLoad rate
-      partition:
-        max: -1 # MB/s, default no limit, not support yet. TODO: limit partition bulkLoad rate
-  dql:
-    # dql limit rates, default no limit.
-    # The maximum rate will not be greater than max.
-    enabled: false
-    searchRate:
-      max: -1 # vps (vectors per second), default no limit
-      db:
-        max: -1 # vps (vectors per second), default no limit
-      collection:
-        max: -1 # vps (vectors per second), default no limit
-      partition:
-        max: -1 # vps (vectors per second), default no limit
-    queryRate:
-      max: -1 # qps, default no limit
-      db:
-        max: -1 # qps, default no limit
-      collection:
-        max: -1 # qps, default no limit
-      partition:
-        max: -1 # qps, default no limit
-  limits:
-    maxCollectionNum: 65536
-    maxCollectionNumPerDB: 65536
-    maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit
-    maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes
-  limitWriting:
-    # forceDeny false means dml requests are allowed (except for some
-    # specific conditions, such as memory of nodes to water marker), true means always reject all dml requests.
-    forceDeny: false
-    ttProtection:
-      enabled: false
-      # maxTimeTickDelay indicates the backpressure for DML Operations.
-      # DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay,
-      # if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected.
-      # seconds
-      maxTimeTickDelay: 300
-    memProtection:
-      # When memory usage > memoryHighWaterLevel, all dml requests would be rejected;
-      # When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate;
-      # When memory usage < memoryLowWaterLevel, no action.
-      enabled: true
-      dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes
-      dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes
-      queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes
-      queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes
-    growingSegmentsSizeProtection:
-      # No action will be taken if the growing segments size is less than the low watermark.
-      # When the growing segments size exceeds the low watermark, the dml rate will be reduced,
-      # but the rate will not be lower than minRateRatio * dmlRate.
-      enabled: false
-      minRateRatio: 0.5
-      lowWaterLevel: 0.2
-      highWaterLevel: 0.4
-    diskProtection:
-      enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected;
-      diskQuota: -1 # MB, (0, +inf), default no limit
-      diskQuotaPerDB: -1 # MB, (0, +inf), default no limit
-      diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit
-      diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit
-  limitReading:
-    # forceDeny false means dql requests are allowed (except for some
-    # specific conditions, such as collection has been dropped), true means always reject all dql requests.
-    forceDeny: false
-    queueProtection:
-      enabled: false
-      # nqInQueueThreshold indicated that the system was under backpressure for Search/Query path.
-      # If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off
-      # until the NQ in queue no longer exceeds nqInQueueThreshold. We think of the NQ of query request as 1.
-      # int, default no limit
-      nqInQueueThreshold: -1
-      # queueLatencyThreshold indicated that the system was under backpressure for Search/Query path.
-      # If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off
-      # until the latency of queuing no longer exceeds queueLatencyThreshold.
-      # The latency here refers to the averaged latency over a period of time.
-      # milliseconds, default no limit
-      queueLatencyThreshold: -1
-    resultProtection:
-      enabled: false
-      # maxReadResultRate indicated that the system was under backpressure for Search/Query path.
-      # If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off
-      # until the read result rate no longer exceeds maxReadResultRate.
-      # MB/s, default no limit
-      maxReadResultRate: -1
-      maxReadResultRatePerDB: -1
-      maxReadResultRatePerCollection: -1
-    # colOffSpeed is the speed of search&query rates cool off.
-    # (0, 1]
-    coolOffSpeed: 0.9
-
-trace:
-  # trace exporter type, default is stdout,
-  # optional values: ['noop','stdout', 'jaeger', 'otlp']
-  exporter: noop
-  # fraction of traceID based sampler,
-  # optional values: [0, 1]
-  # Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
-  sampleFraction: 0
-  jaeger:
-    url: # when exporter is jaeger should set the jaeger's URL
-  otlp:
-    endpoint: # example: "127.0.0.1:4318"
-    secure: true
-
-#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation.
-#here, you can set the size of the memory occupied by the memory pool, with the unit being MB.
-#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize.
-#if initMemSize and MaxMemSize both set zero,
-#milvus will automatically initialize half of the available GPU memory,
-#maxMemSize will the whole available GPU memory.
-gpu:
-  initMemSize: # Gpu Memory Pool init size
-  maxMemSize: # Gpu Memory Pool Max size
--- a/DocIndexRetriever/retrieval_tool.py
+++ b/DocIndexRetriever/retrieval_tool.py
@@ -42,7 +42,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
        next_data = {
            "input": inputs["input"],
            "messages": inputs["input"],
-            "embedding": [item["embedding"] for item in data["data"]],
+            "embedding": data,  # [item["embedding"] for item in data["data"]],
            "k": kwargs["k"] if "k" in kwargs else 4,
            "search_type": kwargs["search_type"] if "search_type" in kwargs else "similarity",
            "distance_threshold": kwargs["distance_threshold"] if "distance_threshold" in kwargs else None,
--- a/DocIndexRetriever/tests/test_compose_milvus_on_gaudi.sh
+++ b/DocIndexRetriever/tests/test_compose_milvus_on_gaudi.sh
@@ -1,146 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -e
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-
-function build_docker_images() {
-    echo "Building Docker Images...."
-    cd $WORKPATH/docker_image_build
-    if [ ! -d "GenAIComps" ] ; then
-        git clone --single-branch --branch "${opea_branch:-"main"}" https://github.com/opea-project/GenAIComps.git
-    fi
-    service_list="dataprep embedding retriever reranking doc-index-retriever"
-    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
-    docker pull quay.io/coreos/etcd:v3.5.5
-    docker pull minio/minio:RELEASE.2023-03-20T20-16-18Z
-    docker pull milvusdb/milvus:v2.4.6
-    docker images && sleep 1s
-
-    echo "Docker images built!"
-}
-
-function start_services() {
-    echo "Starting Docker Services...."
-    cd $WORKPATH/docker_compose/intel/hpu/gaudi
-    export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
-    export RERANK_MODEL_ID="BAAI/bge-reranker-base"
-    export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
-    export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-    export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
-    export MILVUS_HOST=${ip_address}
-    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export MEGA_SERVICE_HOST_IP=${ip_address}
-    export EMBEDDING_SERVICE_HOST_IP=${ip_address}
-    export RETRIEVER_SERVICE_HOST_IP=${ip_address}
-    export RERANK_SERVICE_HOST_IP=${ip_address}
-    export LLM_SERVICE_HOST_IP=${ip_address}
-    export host_ip=${ip_address}
-    export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
-    export RERANK_TYPE="tei"
-    export LOGFLAG=true
-
-    # Start Docker Containers
-    docker compose -f compose_milvus.yaml up -d
-    sleep 2m
-    echo "Docker services started!"
-}
-
-function validate() {
-    local CONTENT="$1"
-    local EXPECTED_RESULT="$2"
-    local SERVICE_NAME="$3"
-
-    if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
-        echo "[ $SERVICE_NAME ] Content is as expected: $CONTENT."
-        echo 0
-    else
-        echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
-        echo 1
-    fi
-}
-
-function validate_megaservice() {
-    echo "===========Ingest data=================="
-    local CONTENT=$(http_proxy="" curl -X POST "http://${ip_address}:6007/v1/dataprep/ingest" \
-     -H "Content-Type: multipart/form-data" \
-     -F 'link_list=["https://opea.dev/"]')
-    local EXIT_CODE=$(validate "$CONTENT" "Data preparation succeeded" "dataprep-milvus-service-gaudi")
-    echo "$EXIT_CODE"
-    local EXIT_CODE="${EXIT_CODE:0-1}"
-    echo "return value is $EXIT_CODE"
-    if [ "$EXIT_CODE" == "1" ]; then
-        docker logs dataprep-milvus-server | tee -a ${LOG_PATH}/dataprep-milvus-service-gaudi.log
-        return 1
-    fi
-
-    # Curl the Mega Service
-    echo "================Testing retriever service: Text Request ================"
-    cd $WORKPATH/tests
-    local CONTENT=$(http_proxy="" curl http://${ip_address}:8889/v1/retrievaltool -X POST -H "Content-Type: application/json" -d '{
-     "text": "Explain the OPEA project?"
-    }')
-
-    local EXIT_CODE=$(validate "$CONTENT" "OPEA" "doc-index-retriever-service-gaudi")
-    echo "$EXIT_CODE"
-    local EXIT_CODE="${EXIT_CODE:0-1}"
-    echo "return value is $EXIT_CODE"
-    if [ "$EXIT_CODE" == "1" ]; then
-        echo "=============Embedding container log=================="
-        docker logs tei-embedding-gaudi-server | tee -a ${LOG_PATH}/doc-index-retriever-service-gaudi.log
-        echo "=============Retriever container log=================="
-        docker logs retriever-milvus-server | tee -a ${LOG_PATH}/doc-index-retriever-service-gaudi.log
-        echo "=============TEI Reranking log=================="
-        docker logs tei-reranking-gaudi-server | tee -a ${LOG_PATH}/doc-index-retriever-service-gaudi.log
-        echo "=============Reranking container log=================="
-        docker logs reranking-tei-gaudi-server | tee -a ${LOG_PATH}/doc-index-retriever-service-gaudi.log
-        echo "=============Doc-index-retriever container log=================="
-        docker logs doc-index-retriever-server | tee -a ${LOG_PATH}/doc-index-retriever-service-gaudi.log
-        exit 1
-    fi
-
-}
-
-function stop_docker() {
-    cd $WORKPATH/docker_compose/intel/hpu/gaudi
-    container_list=$(cat compose_milvus.yaml | grep container_name | cut -d':' -f2)
-    for container_name in $container_list; do
-	echo $container_name
-        cid=$(docker ps -aq --filter "name=$container_name")
-        echo "Stopping container $container_name"
-        if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
-    done
-}
-
-function main() {
-
-    stop_docker
-    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "Dump current docker ps"
-    docker ps
-    start_time=$(date +%s)
-    start_services
-    end_time=$(date +%s)
-    duration=$((end_time-start_time))
-    echo "Mega service start duration is $duration s"
-    validate_megaservice
-
-    stop_docker
-    echo y | docker system prune
-
-}
-
-main
--- a/DocIndexRetriever/tests/test_compose_milvus_on_xeon.sh
+++ b/DocIndexRetriever/tests/test_compose_milvus_on_xeon.sh
@@ -1,145 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -e
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-
-WORKPATH=$(dirname "$PWD")
-LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
-
-function build_docker_images() {
-    echo "Building Docker Images...."
-    cd $WORKPATH/docker_image_build
-    if [ ! -d "GenAIComps" ] ; then
-        git clone --single-branch --branch "${opea_branch:-"main"}" https://github.com/opea-project/GenAIComps.git
-    fi
-    service_list="dataprep embedding retriever reranking doc-index-retriever"
-    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-    docker pull quay.io/coreos/etcd:v3.5.5
-    docker pull minio/minio:RELEASE.2023-03-20T20-16-18Z
-    docker pull milvusdb/milvus:v2.4.6
-    docker images && sleep 1s
-
-    echo "Docker images built!"
-}
-
-function start_services() {
-    echo "Starting Docker Services...."
-    cd $WORKPATH/docker_compose/intel/cpu/xeon
-    export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
-    export RERANK_MODEL_ID="BAAI/bge-reranker-base"
-    export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006"
-    export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-    export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
-    export MILVUS_HOST=${ip_address}
-    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-    export MEGA_SERVICE_HOST_IP=${ip_address}
-    export EMBEDDING_SERVICE_HOST_IP=${ip_address}
-    export RETRIEVER_SERVICE_HOST_IP=${ip_address}
-    export RERANK_SERVICE_HOST_IP=${ip_address}
-    export LLM_SERVICE_HOST_IP=${ip_address}
-    export host_ip=${ip_address}
-    export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
-    export RERANK_TYPE="tei"
-    export LOGFLAG=true
-
-    # Start Docker Containers
-    docker compose -f compose_milvus.yaml up -d
-    sleep 2m
-    echo "Docker services started!"
-}
-
-function validate() {
-    local CONTENT="$1"
-    local EXPECTED_RESULT="$2"
-    local SERVICE_NAME="$3"
-
-    if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
-        echo "[ $SERVICE_NAME ] Content is as expected: $CONTENT."
-        echo 0
-    else
-        echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
-        echo 1
-    fi
-}
-
-function validate_megaservice() {
-    echo "===========Ingest data=================="
-    local CONTENT=$(http_proxy="" curl -X POST "http://${ip_address}:6007/v1/dataprep/ingest" \
-     -H "Content-Type: multipart/form-data" \
-     -F 'link_list=["https://opea.dev/"]')
-    local EXIT_CODE=$(validate "$CONTENT" "Data preparation succeeded" "dataprep-milvus-service-xeon")
-    echo "$EXIT_CODE"
-    local EXIT_CODE="${EXIT_CODE:0-1}"
-    echo "return value is $EXIT_CODE"
-    if [ "$EXIT_CODE" == "1" ]; then
-        docker logs dataprep-milvus-server | tee -a ${LOG_PATH}/dataprep-milvus-service-xeon.log
-        return 1
-    fi
-
-    # Curl the Mega Service
-    echo "================Testing retriever service: Text Request ================"
-    cd $WORKPATH/tests
-    local CONTENT=$(http_proxy="" curl http://${ip_address}:8889/v1/retrievaltool -X POST -H "Content-Type: application/json" -d '{
-     "text": "Explain the OPEA project?"
-    }')
-
-    local EXIT_CODE=$(validate "$CONTENT" "OPEA" "doc-index-retriever-service-xeon")
-    echo "$EXIT_CODE"
-    local EXIT_CODE="${EXIT_CODE:0-1}"
-    echo "return value is $EXIT_CODE"
-    if [ "$EXIT_CODE" == "1" ]; then
-        echo "=============Embedding container log=================="
-        docker logs embedding-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log
-        echo "=============Retriever container log=================="
-        docker logs retriever-milvus-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log
-        echo "=============TEI Reranking log=================="
-        docker logs tei-reranking-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log
-        echo "=============Reranking container log=================="
-        docker logs reranking-tei-xeon-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log
-        echo "=============Doc-index-retriever container log=================="
-        docker logs doc-index-retriever-server | tee -a ${LOG_PATH}/doc-index-retriever-service-xeon.log
-        exit 1
-    fi
-
-}
-
-function stop_docker() {
-    cd $WORKPATH/docker_compose/intel/cpu/xeon
-    container_list=$(cat compose_milvus.yaml | grep container_name | cut -d':' -f2)
-    for container_name in $container_list; do
-	echo $container_name
-        cid=$(docker ps -aq --filter "name=$container_name")
-        echo "Stopping container $container_name"
-        if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
-    done
-}
-
-function main() {
-
-    stop_docker
-    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "Dump current docker ps"
-    docker ps
-    start_time=$(date +%s)
-    start_services
-    end_time=$(date +%s)
-    duration=$((end_time-start_time))
-    echo "Mega service start duration is $duration s"
-    validate_megaservice
-
-    stop_docker
-    echo y | docker system prune
-
-}
-
-main
--- a/DocIndexRetriever/tests/test_compose_on_xeon.sh
+++ b/DocIndexRetriever/tests/test_compose_on_xeon.sh
@@ -24,7 +24,7 @@ function build_docker_images() {
    service_list="dataprep embedding retriever reranking doc-index-retriever"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull redis/redis-stack:7.2.0-v9
    docker images && sleep 1s

--- a/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh
+++ b/DocIndexRetriever/tests/test_compose_without_rerank_on_xeon.sh
@@ -24,7 +24,7 @@ function build_docker_images() {
    service_list="dataprep embedding retriever doc-index-retriever"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull redis/redis-stack:7.2.0-v9
    docker images && sleep 1s

--- a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -33,7 +33,7 @@ services:
      retries: 20
      start_period: 3s
  tei-embedding-serving:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-serving
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
    ports:
--- a/GraphRAG/tests/test_compose_on_gaudi.sh
+++ b/GraphRAG/tests/test_compose_on_gaudi.sh
@@ -35,7 +35,7 @@ function build_docker_images() {
    docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker images && sleep 1s
 }

--- a/MultimodalQnA/Dockerfile
+++ b/MultimodalQnA/Dockerfile
@@ -1,9 +1,48 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-ARG BASE_TAG=latest
-FROM opea/comps-base:$BASE_TAG
+# Stage 1: base image (python:3.11-slim plus a non-root "user" account) reused by every later stage
+FROM python:3.11-slim AS base
+
+# Apply pending security updates, then remove the apt cache and package lists
+RUN apt-get update && apt-get upgrade -y && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ENV HOME=/home/user
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p $HOME && \
+    chown -R user $HOME
+
+WORKDIR $HOME
+
+# Stage 2: shallow clone (--depth 1) of the GenAIComps default branch; only used as a COPY source
+FROM base AS git
+
+RUN apt-get update && apt-get install -y --no-install-recommends git
+RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
+
+# Stage 3: common layer shared by services using GenAIComps (sources + Python requirements)
+FROM base AS comps-base
+
+# Copy only the comps/ package, top-level files matching *.*, and LICENSE from the git stage
+COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
+COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
+
+WORKDIR $HOME/GenAIComps
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
+WORKDIR $HOME
+
+ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
+
+USER user
+
+
+# Stage 4: MultimodalQnA-specific layer: adds multimodalqna.py and runs it as the entrypoint
+FROM comps-base

 COPY ./multimodalqna.py $HOME/multimodalqna.py

 ENTRYPOINT ["python", "multimodalqna.py"]
+# Debugging alternative (presumably to keep the container idle for inspection): ENTRYPOINT ["/usr/bin/sleep", "infinity"]
--- a/MultimodalQnA/README.md
+++ b/MultimodalQnA/README.md
@@ -90,7 +90,7 @@ In the below, we provide a table that describes for each microservice component
 | MicroService | Open Source Project   | HW    | Port | Endpoint                                                    |
 | ------------ | --------------------- | ----- | ---- | ----------------------------------------------------------- |
 | Embedding    | Langchain             | Xeon  | 6000 | /v1/embeddings                                              |
-| Retriever    | Langchain, Redis      | Xeon  | 7000 | /v1/retrieval                                               |
+| Retriever    | Langchain, Redis      | Xeon  | 7000 | /v1/multimodal_retrieval                                    |
 | LVM          | Langchain, TGI        | Gaudi | 9399 | /v1/lvm                                                     |
 | Dataprep     | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |

--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
@@ -178,7 +178,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \

 ```bash
 export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:7000/v1/retrieval \
+curl http://${host_ip}:7000/v1/multimodal_retrieval \
    -X POST \
    -H "Content-Type: application/json" \
    -d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
@@ -264,7 +264,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \

 ```bash
 export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval \
+curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/multimodal_retrieval \
    -X POST \
    -H "Content-Type: application/json" \
    -d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -210,7 +210,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \

 ```bash
 export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:7000/v1/retrieval \
+curl http://${host_ip}:7000/v1/multimodal_retrieval \
    -X POST \
    -H "Content-Type: application/json" \
    -d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
@@ -336,7 +336,6 @@ To delete all uploaded files along with data indexed with `$INDEX_NAME` in REDIS
 ```bash
 curl -X POST \
    -H "Content-Type: application/json" \
-    -d '{"file_path": "all"}' \
    ${DATAPREP_DELETE_FILE_ENDPOINT}
 ```

--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md
@@ -81,6 +81,13 @@ cd GenAIExamples/CodeGen
 docker build --no-cache -t opea/codegen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
 ```

+#### 8.4 Build FAQGen Megaservice Docker Images
+
+```bash
+cd GenAIExamples/FaqGen
+docker build --no-cache -t opea/faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
 ### 9. Build UI Docker Image

 Build frontend Docker image that enables via below command:
@@ -152,6 +159,7 @@ export TGI_LLM_ENDPOINT_FAQGEN="http://${host_ip}:9009"
 export TGI_LLM_ENDPOINT_DOCSUM="http://${host_ip}:9009"
 export BACKEND_SERVICE_ENDPOINT_CHATQNA="http://${host_ip}:8888/v1/chatqna"
 export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/delete"
+export BACKEND_SERVICE_ENDPOINT_FAQGEN="http://${host_ip}:8889/v1/faqgen"
 export BACKEND_SERVICE_ENDPOINT_CODEGEN="http://${host_ip}:7778/v1/codegen"
 export BACKEND_SERVICE_ENDPOINT_DOCSUM="http://${host_ip}:8890/v1/docsum"
 export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
@@ -308,7 +316,15 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
         }'
    ```

-13. DocSum MegaService
+13. FAQGen MegaService
+
+    ```bash
+    curl http://${host_ip}:8889/v1/faqgen -H "Content-Type: application/json" -d '{
+         "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+         }'
+    ```
+
+14. DocSum MegaService

    ```bash
    curl http://${host_ip}:8890/v1/docsum -H "Content-Type: application/json" -d '{
@@ -316,7 +332,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
         }'
    ```

-14. CodeGen MegaService
+15. CodeGen MegaService

    ```bash
    curl http://${host_ip}:7778/v1/codegen -H "Content-Type: application/json" -d '{
@@ -324,7 +340,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
         }'
    ```

-15. Dataprep Microservice
+16. Dataprep Microservice

    If you want to update the default knowledge base, you can use the following commands:

@@ -374,7 +390,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
         -H "Content-Type: application/json"
    ```

-16. Prompt Registry Microservice
+17. Prompt Registry Microservice

    If you want to update the default Prompts in the application for your user, you can use the following commands:

@@ -417,7 +433,7 @@ Please refer to **[keycloak_setup_guide](keycloak_setup_guide.md)** for more det
      "user": "test", "prompt_id":"{prompt_id to be deleted}"}'
    ```

-17. Chat History Microservice
+18. Chat History Microservice

    To validate the chatHistory Microservice, you can use the following commands.

--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/compose.yaml
@@ -33,7 +33,7 @@ services:
      DATAPREP_TYPE: ${DATAPREP_TYPE}
      LOGFLAG: ${LOGFLAG}
  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
    container_name: tei-embedding-server
    ports:
@@ -89,7 +89,7 @@ services:
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${RERANK_MODEL_ID} --auto-truncate"
    container_name: tei-reranking-server
    ports:
@@ -280,6 +280,24 @@ services:
      FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped
+  faqgen-xeon-backend-server:  # FAQGen megaservice container
+    image: ${REGISTRY:-opea}/faqgen:${TAG:-latest}  # opea/faqgen:latest when REGISTRY/TAG are unset or empty
+    container_name: faqgen-xeon-backend-server
+    depends_on:  # started after the TGI and FAQGen LLM services
+      - tgi_service
+      - llm_faqgen
+    ports:
+      - "8889:8888"  # host port 8889 -> container port 8888
+    environment:
+      no_proxy: ${no_proxy}
+      https_proxy: ${https_proxy}
+      http_proxy: ${http_proxy}
+      MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
+      LLM_SERVICE_PORT: ${LLM_SERVICE_HOST_PORT_FAQGEN}
+      LLM_SERVICE_HOST_IP: ${LLM_SERVICE_HOST_IP_FAQGEN}
+      LOGFLAG: ${LOGFLAG}  # no default here, unlike the ${LOGFLAG:-False} used by the preceding service
+    ipc: host
+    restart: always
  mongo:
    image: mongo:7.0.11
    container_name: mongodb
@@ -344,6 +362,7 @@ services:
      - APP_BACKEND_SERVICE_ENDPOINT_CHATQNA=${BACKEND_SERVICE_ENDPOINT_CHATQNA}
      - APP_BACKEND_SERVICE_ENDPOINT_CODEGEN=${BACKEND_SERVICE_ENDPOINT_CODEGEN}
      - APP_BACKEND_SERVICE_ENDPOINT_DOCSUM=${BACKEND_SERVICE_ENDPOINT_DOCSUM}
+      - APP_BACKEND_SERVICE_ENDPOINT_FAQGEN=${BACKEND_SERVICE_ENDPOINT_FAQGEN}
      - APP_DATAPREP_SERVICE_ENDPOINT=${DATAPREP_SERVICE_ENDPOINT}
      - APP_DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT}
      - APP_DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT}
--- a/ProductivitySuite/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/set_env.sh
@@ -33,6 +33,7 @@ export TGI_LLM_ENDPOINT_FAQGEN="http://${host_ip}:9009"
 export TGI_LLM_ENDPOINT_DOCSUM="http://${host_ip}:9009"
 export BACKEND_SERVICE_ENDPOINT_CHATQNA="http://${host_ip}:8888/v1/chatqna"
 export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/delete"
+export BACKEND_SERVICE_ENDPOINT_FAQGEN="http://${host_ip}:8889/v1/faqgen"
 export BACKEND_SERVICE_ENDPOINT_CODEGEN="http://${host_ip}:7778/v1/codegen"
 export BACKEND_SERVICE_ENDPOINT_DOCSUM="http://${host_ip}:8890/v1/docsum"
 export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
--- a/ProductivitySuite/docker_image_build/build.yaml
+++ b/ProductivitySuite/docker_image_build/build.yaml
@@ -68,12 +68,18 @@ services:
      context: ../../CodeGen/
      dockerfile: ./Dockerfile
    image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
-  llm-faqgen:
+  faqgen:
    build:
      args:
        http_proxy: ${http_proxy}
        https_proxy: ${https_proxy}
        no_proxy: ${no_proxy}
+      context: ../../FaqGen/
+      dockerfile: ./Dockerfile
+    image: ${REGISTRY:-opea}/faqgen:${TAG:-latest}
+  llm-faqgen:
+    build:
      context: GenAIComps
      dockerfile: comps/llms/src/faq-generation/Dockerfile
+    extends: faqgen
    image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
--- a/ProductivitySuite/tests/test_compose_on_xeon.sh
+++ b/ProductivitySuite/tests/test_compose_on_xeon.sh
@@ -22,7 +22,7 @@ function build_docker_images() {
    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    docker images && sleep 1s
 }
@@ -58,6 +58,7 @@ function start_services() {
    export TGI_LLM_ENDPOINT_FAQGEN="http://${ip_address}:9009"
    export TGI_LLM_ENDPOINT_DOCSUM="http://${ip_address}:9009"
    export BACKEND_SERVICE_ENDPOINT_CHATQNA="http://${ip_address}:8888/v1/chatqna"
+    export BACKEND_SERVICE_ENDPOINT_FAQGEN="http://${ip_address}:8889/v1/faqgen"
    export DATAPREP_DELETE_FILE_ENDPOINT="http://${ip_address}:5000/v1/dataprep/delete"
    export BACKEND_SERVICE_ENDPOINT_CODEGEN="http://${ip_address}:7778/v1/codegen"
    export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:5000/v1/dataprep/ingest"
@@ -115,6 +116,9 @@ function validate_service() {
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
+    elif [[ $SERVICE_NAME == *"faqgen-xeon-backend-server"* ]]; then
+        local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL")
    else
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
    fi
@@ -141,6 +145,33 @@ function validate_service() {
    sleep 1s
 }

+# Validate the FAQGen megaservice. Args: $1 endpoint URL, $2 service name (log file stem), $3 container name.
+# Exits the script with status 1 when the HTTP status is not 200 or the body lacks "Embeddings".
+function validate_faqgen_megaservice() {
+    local URL="$1" SERVICE_NAME="$2" DOCKER_NAME="$3"
+    local EXPECTED_RESULT="Embeddings"
+    local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
+    # One request only: the status and the body are checked against the same response.
+    # curl appends "HTTPSTATUS:<code>" after the body; the two expansions below split them apart.
+    local RESPONSE
+    RESPONSE=$(curl -s -w "HTTPSTATUS:%{http_code}" -X POST -F "$INPUT_DATA" -F "max_tokens=32" -F "stream=False" -H 'Content-Type: multipart/form-data' "$URL")
+    local HTTP_STATUS="${RESPONSE##*HTTPSTATUS:}" CONTENT="${RESPONSE%HTTPSTATUS:*}"
+    # Keep the raw body in the service log; container logs are appended to it on failure.
+    printf '%s\n' "$CONTENT" > ${LOG_PATH}/${SERVICE_NAME}.log
+    if [[ "$HTTP_STATUS" != "200" ]]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+        exit 1
+    fi
+    echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    if ! echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+        docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+        exit 1
+    fi
+    echo "[ $SERVICE_NAME ] Content is as expected." && sleep 1s
+}
+
 function validate_faqgen() {
    local URL="$1"
    local EXPECTED_RESULT="$2"
@@ -320,6 +351,12 @@ function validate_megaservice() {
        "chatqna-xeon-backend-server" \
        '{"messages": "What is the revenue of Nike in 2023?"}'\

+    # Curl the FAQGenMega Service
+    validate_faqgen_megaservice \
+        "${ip_address}:8889/v1/faqgen" \
+        "faqgen-xeon-backend-server" \
+        "faqgen-xeon-backend-server"
+
    # Curl the CodeGen Mega Service
    validate_service \
    "${ip_address}:7778/v1/codegen" \
--- a/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/SearchQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -61,7 +61,7 @@ services:
      GOOGLE_CSE_ID: ${GOOGLE_CSE_ID}
    restart: unless-stopped
  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${RERANK_MODEL_ID} --auto-truncate"
    container_name: tei-reranking-server
    ports:
--- a/SearchQnA/tests/test_compose_on_gaudi.sh
+++ b/SearchQnA/tests/test_compose_on_gaudi.sh
@@ -35,7 +35,7 @@ function build_docker_images() {
    service_list="searchqna searchqna-ui embedding web-retriever reranking llm-textgen"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

-    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker images && sleep 1s
--- a/Translation/ui/svelte/src/lib/shared/Network.ts
+++ b/Translation/ui/svelte/src/lib/shared/Network.ts
@@ -23,7 +23,8 @@ export async function fetchLanguageResponse(input: string, transform: string, tr
  payload = {
    language_from: transform,
    language_to: transTo,
-    source_language: input,
+    source_data: input,
+    translate_type: "text",
  };
  url = `${BASE_URL}`;
Author	SHA1	Message	Date
pre-commit-ci[bot]	99ffa4800e	[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci	2025-03-21 08:03:31 +00:00
WenjiaoYue	e2bd8f50af	update translation UI response format	2025-03-21 15:57:43 +08:00