Merge branch 'main' into replace_agent_ui

update ui style.
patch openwebui for opea agent.
2025-04-07 10:05:35 +08:00 · 2025-04-04 07:41:56 +00:00 · 2025-04-03 15:02:06 +00:00
474 changed files with 11151 additions and 23161 deletions
--- a/.github/ISSUE_TEMPLATE/1_bug_template.yml
+++ b/.github/ISSUE_TEMPLATE/1_bug_template.yml
@@ -32,7 +32,6 @@ body:
        - Mac
        - BSD
        - Other (Please let us know in description)
-        - N/A
    validations:
      required: true

@@ -57,7 +56,6 @@ body:
        - GPU-Nvidia
        - GPU-AMD
        - GPU-other (Please let us know in description)
-        - N/A
    validations:
      required: true

@@ -69,7 +67,6 @@ body:
        - label: Pull docker images from hub.docker.com
        - label: Build docker images from source
        - label: Other
-        - label: N/A
    validations:
      required: true

@@ -83,7 +80,6 @@ body:
        - label: Kubernetes Helm Charts
        - label: Kubernetes GMC
        - label: Other
-        - label: N/A
    validations:
      required: true

@@ -95,7 +91,6 @@ body:
        - Single Node
        - Multiple Nodes
        - Other
-        - N/A
      default: 0
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/2_feature_template.yml
+++ b/.github/ISSUE_TEMPLATE/2_feature_template.yml
@@ -32,7 +32,6 @@ body:
        - Mac
        - BSD
        - Other (Please let us know in description)
-        - N/A
    validations:
      required: true

@@ -57,7 +56,6 @@ body:
        - GPU-Nvidia
        - GPU-AMD
        - GPU-other (Please let us know in description)
-        - N/A
    validations:
      required: true

@@ -69,7 +67,6 @@ body:
        - Single Node
        - Multiple Nodes
        - Other
-        - N/A
      default: 0
    validations:
      required: true
--- a/.github/workflows/_build_comps_base_image.yml
+++ b/.github/workflows/_build_comps_base_image.yml
@@ -35,9 +35,9 @@ jobs:
      - name: Check if job should be skipped
        id: check-skip
        run: |
-          should_skip=true
-          if [[ "${{ inputs.node }}" == "gaudi" || "${{ inputs.node }}" == "xeon" ]]; then
-            should_skip=false
+          should_skip=false
+          if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
+            should_skip=true
          fi
          echo "should_skip=$should_skip"
          echo "should_skip=$should_skip" >> $GITHUB_OUTPUT
--- a/.github/workflows/_build_image.yml
+++ b/.github/workflows/_build_image.yml
@@ -42,9 +42,9 @@ jobs:
      - name: Check if job should be skipped
        id: check-skip
        run: |
-          should_skip=true
-          if [[ "${{ inputs.node }}" == "gaudi" || "${{ inputs.node }}" == "xeon" ]]; then
-            should_skip=false
+          should_skip=false
+          if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
+            should_skip=true
          fi
          echo "should_skip=$should_skip"
          echo "should_skip=$should_skip" >> $GITHUB_OUTPUT
@@ -77,13 +77,15 @@ jobs:
          docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
          if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
              git clone https://github.com/vllm-project/vllm.git && cd vllm
-              VLLM_VER=v0.8.3
+              # Get the latest tag
+              VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
              echo "Check out vLLM tag ${VLLM_VER}"
              git checkout ${VLLM_VER} &> /dev/null && cd ../
          fi
          if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
              git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
-              VLLM_VER=v0.6.6.post1+Gaudi-1.20.0
+              # Get the latest tag
+              VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
              echo "Check out vLLM tag ${VLLM_VER}"
              git checkout ${VLLM_VER} &> /dev/null && cd ../
          fi
--- a/.github/workflows/_example-workflow.yml
+++ b/.github/workflows/_example-workflow.yml
@@ -76,7 +76,6 @@ jobs:
      example: ${{ inputs.example }}
      hardware: ${{ inputs.node }}
      use_model_cache: ${{ inputs.use_model_cache }}
-      opea_branch: ${{ inputs.opea_branch }}
    secrets: inherit


--- a/.github/workflows/_helm-e2e.yml
+++ b/.github/workflows/_helm-e2e.yml
@@ -81,10 +81,6 @@ jobs:
                if [[ "${{ inputs.hardware }}" == "gaudi" ]]; then
                  value_files="${value_files}\"${filename}\","
                fi
-              elif [[ "$filename" == *"rocm"* ]]; then
-                if [[ "${{ inputs.hardware }}" == "rocm" ]]; then
-                  value_files="${value_files}\"${filename}\","
-                fi
              elif [[ "$filename" == *"nv"* ]]; then
                continue
              else
--- a/.github/workflows/_run-docker-compose.yml
+++ b/.github/workflows/_run-docker-compose.yml
@@ -32,10 +32,6 @@ on:
        required: false
        type: boolean
        default: false
-      opea_branch:
-        default: "main"
-        required: false
-        type: string
 jobs:
  get-test-case:
    runs-on: ubuntu-latest
@@ -68,10 +64,8 @@ jobs:
          cd ${{ github.workspace }}/${{ inputs.example }}/tests
          run_test_cases=""

-          if [[ "${{ inputs.hardware }}" == "gaudi"* ]]; then
+          if [ "${{ inputs.hardware }}" == "gaudi2" ] || [ "${{ inputs.hardware }}" == "gaudi3" ]; then
            hardware="gaudi"
-          elif [[ "${{ inputs.hardware }}" == "xeon"* ]]; then
-            hardware="xeon"
          else
            hardware="${{ inputs.hardware }}"
          fi
@@ -122,17 +116,13 @@ jobs:
        run: |
          sudo rm -rf ${{github.workspace}}/* || true

-          echo "Cleaning up containers using ports..."
-          cid=$(docker ps --format '{{.Names}} : {{.Ports}}' | grep -v ' : $' | grep -v 0.0.0.0:5000 | awk -F' : ' '{print $1}')
+          # clean up containers use ports
+          cid=$(docker ps --format '{{.Names}} : {{.Ports}}' | grep -v ' : $' | grep -v 5000 | awk -F' : ' '{print $1}')
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
-          docker system prune -f

-          echo "Cleaning up images ..."
-          docker images --filter reference="*/*/*:latest" -q | xargs -r docker rmi && sleep 1s
-          docker images --filter reference="*/*:ci" -q | xargs -r docker rmi && sleep 1s
-          docker images --filter reference="*:5000/*/*" -q | xargs -r docker rmi && sleep 1s
-          docker images --filter reference="opea/comps-base" -q | xargs -r docker rmi && sleep 1s
-          docker images
+          docker system prune -f
+          docker rmi $(docker images --filter reference="*/*/*:latest" -q) || true
+          docker rmi $(docker images --filter reference="*/*:ci" -q) || true

      - name: Checkout out Repo
        uses: actions/checkout@v4
@@ -151,12 +141,6 @@ jobs:
          bash ${{ github.workspace }}/.github/workflows/scripts/docker_compose_clean_up.sh "ports"
          docker ps

-      - name: Log in DockerHub
-        uses: docker/login-action@v3.2.0
-        with:
-          username: ${{ secrets.DOCKERHUB_USER }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
      - name: Run test
        shell: bash
        env:
@@ -169,11 +153,8 @@ jobs:
          SDK_BASE_URL: ${{ secrets.SDK_BASE_URL }}
          SERVING_TOKEN: ${{ secrets.SERVING_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          FINNHUB_API_KEY: ${{ secrets.FINNHUB_API_KEY }}
-          FINANCIAL_DATASETS_API_KEY: ${{ secrets.FINANCIAL_DATASETS_API_KEY }}
          IMAGE_REPO: ${{ inputs.registry }}
          IMAGE_TAG: ${{ inputs.tag }}
-          opea_branch: ${{ inputs.opea_branch }}
          example: ${{ inputs.example }}
          hardware: ${{ inputs.hardware }}
          test_case: ${{ matrix.test_case }}
@@ -186,38 +167,30 @@ jobs:
              export model_cache="/data2/hf_model"
            else
              echo "Model cache directory /data2/hf_model does not exist"
-              export model_cache="$HOME/.cache/huggingface/hub"
-            fi
-            if [[ "$test_case" == *"rocm"* ]]; then
-              export model_cache="/var/lib/GenAI/data"
+              export model_cache="$HOME/.cache/huggingface/hub"  # use $HOME: "~" is not expanded inside double quotes
            fi
          fi
          if [ -f "${test_case}" ]; then timeout 60m bash "${test_case}"; else echo "Test script {${test_case}} not found, skip test!"; fi

      - name: Clean up container after test
-        if: always()
+        shell: bash
+        if: cancelled() || failure()
        run: |
-          set -x
+          cd ${{ github.workspace }}/${{ inputs.example }}
+          export test_case=${{ matrix.test_case }}
+          export hardware=${{ inputs.hardware }}
+          bash ${{ github.workspace }}/.github/workflows/scripts/docker_compose_clean_up.sh "containers"

-          echo "Cleaning up containers using ports..."
-          cid=$(docker ps --format '{{.Names}} : {{.Ports}}' | grep -v ' : $' | grep -v 0.0.0.0:5000 | awk -F' : ' '{print $1}')
+          # clean up containers use ports
+          cid=$(docker ps --format '{{.Names}} : {{.Ports}}' | grep -v ' : $' | grep -v 5000 | awk -F' : ' '{print $1}')
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi

-          echo "Cleaning up images ..."
-          if [[ "${{ inputs.hardware }}" == "xeon"* ]]; then
-              docker system prune -a -f
-          else
-              docker images --filter reference="*/*/*:latest" -q | xargs -r docker rmi && sleep 1s
-              docker images --filter reference="*/*:ci" -q | xargs -r docker rmi && sleep 1s
-              docker images --filter reference="*:5000/*/*" -q | xargs -r docker rmi && sleep 1s
-              docker images --filter reference="opea/comps-base" -q | xargs -r docker rmi && sleep 1s
-              docker system prune -f
-          fi
-          docker images
+          docker system prune -f
+          docker rmi $(docker images --filter reference="*:5000/*/*" -q) || true

      - name: Publish pipeline artifact
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ inputs.hardware }}_${{ inputs.example }}_${{ matrix.test_case }}
+          name: ${{ inputs.example }}_${{ matrix.test_case }}
          path: ${{ github.workspace }}/${{ inputs.example }}/tests/*.log
--- a/.github/workflows/dockerhub-description.yml
+++ b/.github/workflows/dockerhub-description.yml
--- a/.github/workflows/manual-example-workflow.yml
+++ b/.github/workflows/manual-example-workflow.yml
@@ -7,7 +7,7 @@ on:
    inputs:
      nodes:
        default: "gaudi,xeon"
-        description: "Hardware to run test gaudi,xeon,rocm,arc,gaudi3,xeon-gnr"
+        description: "Hardware to run test gaudi,gaudi3,xeon,rocm,arc"
        required: true
        type: string
      examples:
--- a/.github/workflows/nightly-docker-build-publish.yml
+++ b/.github/workflows/nightly-docker-build-publish.yml
@@ -5,7 +5,7 @@ name: Nightly build/publish latest docker images

 on:
  schedule:
-    - cron: "30 14 * * 1-5" # UTC time
+    - cron: "30 14 * * *" # UTC time
  workflow_dispatch:

 env:
@@ -38,21 +38,8 @@ jobs:
    with:
      node: gaudi

-  build-images:
-    needs: [get-build-matrix, build-comps-base]
-    strategy:
-      matrix:
-        example: ${{ fromJSON(needs.get-build-matrix.outputs.examples_json) }}
-      fail-fast: false
-    uses: ./.github/workflows/_build_image.yml
-    with:
-      node: gaudi
-      example: ${{ matrix.example }}
-      inject_commit: true
-    secrets: inherit
-
-  test-example:
-    needs: [get-build-matrix]
+  build-and-test:
+    needs: get-build-matrix
    if: ${{ needs.get-build-matrix.outputs.examples_json != '' }}
    strategy:
      matrix:
@@ -60,22 +47,21 @@ jobs:
      fail-fast: false
    uses: ./.github/workflows/_example-workflow.yml
    with:
-      node: xeon
-      build: false
+      node: gaudi
      example: ${{ matrix.example }}
      test_compose: true
      inject_commit: true
    secrets: inherit

  get-image-list:
-    needs: [get-build-matrix]
+    needs: get-build-matrix
    uses: ./.github/workflows/_get-image-list.yml
    with:
      examples: ${{ needs.get-build-matrix.outputs.EXAMPLES }}

  publish:
-    needs: [get-build-matrix, get-image-list, build-images]
-    if: always()
+    needs: [get-build-matrix, get-image-list, build-and-test]
+    if: ${{ always() && needs.get-image-list.outputs.matrix != '' }} # keep as ONE expression; a partially wrapped condition evaluates as a non-empty (always-true) string
    strategy:
      matrix:
        image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
--- a/.github/workflows/pr-chart-e2e.yml
+++ b/.github/workflows/pr-chart-e2e.yml
@@ -46,8 +46,6 @@ jobs:
              example=$(echo "$values_file" | cut -d'/' -f1) # CodeGen
              if [[ "$valuefile" == *"gaudi"* ]]; then
                hardware="gaudi"
-              elif [[ "$valuefile" == *"rocm"* ]]; then
-                hardware="rocm"
              elif [[ "$valuefile" == *"nv"* ]]; then
                continue
              else
--- a/.github/workflows/weekly-example-test.yml
+++ b/.github/workflows/weekly-example-test.yml
@@ -1,55 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-name: Weekly test all examples on multiple HWs
-
-on:
-  schedule:
-    - cron: "30 2 * * 6" # UTC time
-  workflow_dispatch:
-
-env:
-  EXAMPLES: ${{ vars.NIGHTLY_RELEASE_EXAMPLES }}
-  NODES: "gaudi,xeon,rocm,arc"
-
-jobs:
-  get-test-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      examples: ${{ steps.get-matrix.outputs.examples }}
-      nodes: ${{ steps.get-matrix.outputs.nodes }}
-    steps:
-      - name: Create Matrix
-        id: get-matrix
-        run: |
-          examples=($(echo ${EXAMPLES} | tr ',' ' '))
-          examples_json=$(printf '%s\n' "${examples[@]}" | sort -u | jq -R '.' | jq -sc '.')
-          echo "examples=$examples_json" >> $GITHUB_OUTPUT
-          nodes=($(echo ${NODES} | tr ',' ' '))
-          nodes_json=$(printf '%s\n' "${nodes[@]}" | sort -u | jq -R '.' | jq -sc '.')
-          echo "nodes=$nodes_json" >> $GITHUB_OUTPUT
-
-  build-comps-base:
-    needs: [get-test-matrix]
-    strategy:
-      matrix:
-        node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
-    uses: ./.github/workflows/_build_comps_base_image.yml
-    with:
-      node: ${{ matrix.node }}
-
-  run-examples:
-    needs: [get-test-matrix, build-comps-base]
-    strategy:
-      matrix:
-        example: ${{ fromJson(needs.get-test-matrix.outputs.examples) }}
-        node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
-      fail-fast: false
-    uses: ./.github/workflows/_example-workflow.yml
-    with:
-      node: ${{ matrix.node }}
-      example: ${{ matrix.example }}
-      build: true
-      test_compose: true
-      test_helmchart: true
-    secrets: inherit
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -74,7 +74,7 @@ repos:
        name: Unused noqa

  - repo: https://github.com/pycqa/isort
-    rev: 6.0.1
+    rev: 5.13.2
    hooks:
      - id: isort

@@ -100,7 +100,7 @@ repos:
          - prettier@3.2.5

  - repo: https://github.com/psf/black.git
-    rev: 25.1.0
+    rev: 24.10.0
    hooks:
      - id: black
        files: (.*\.py)$
@@ -114,7 +114,7 @@ repos:
          - black==24.10.0

  - repo: https://github.com/codespell-project/codespell
-    rev: v2.4.1
+    rev: v2.3.0
    hooks:
      - id: codespell
        args: [-w]
@@ -122,7 +122,7 @@ repos:
          - tomli

  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.4
+    rev: v0.8.6
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix, --no-cache]
--- a/AgentQnA/README.md
+++ b/AgentQnA/README.md
@@ -4,10 +4,9 @@

 1. [Overview](#overview)
 2. [Deploy with Docker](#deploy-with-docker)
-3. [How to interact with the agent system with UI](#how-to-interact-with-the-agent-system-with-ui)
+3. [Launch the UI](#launch-the-ui)
 4. [Validate Services](#validate-services)
 5. [Register Tools](#how-to-register-other-tools-with-the-ai-agent)
-6. [Monitoring and Tracing](#monitor-and-tracing)

 ## Overview

@@ -145,19 +144,21 @@ source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon/set_env.sh

 ### 2. Launch the multi-agent system. </br>

-We make it convenient to launch the whole system with docker compose, which includes microservices for LLM, agents, UI, retrieval tool, vector database, dataprep, and telemetry. There are 3 docker compose files, which make it easy for users to pick and choose. Users can choose a different retrieval tool other than the `DocIndexRetriever` example provided in our GenAIExamples repo. Users can choose not to launch the telemetry containers.
+Two options are provided for the `llm_engine` of the agents: 1. open-source LLMs on Gaudi, 2. OpenAI models via API calls.

-#### Launch on Gaudi
+#### Gaudi

-On Gaudi, `meta-llama/Meta-Llama-3.3-70B-Instruct` will be served using vllm. The command below will launch the multi-agent system with the `DocIndexRetriever` as the retrieval tool for the Worker RAG agent.
+On Gaudi, `meta-llama/Meta-Llama-3.1-70B-Instruct` will be served using vllm.
+By default, both the RAG agent and SQL agent will be launched to support the React Agent.  
+The React Agent requires the DocIndexRetriever's [`compose.yaml`](../DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml) file, so two `compose.yaml` files need to be run with docker compose to start the multi-agent system.
+
+> **Note**: To enable the web search tool, skip this step and proceed to the "[Optional] Web Search Tool Support" section.

 ```bash
 cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
 docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
 ```

-> **Note**: To enable the web search tool, skip this step and proceed to the "[Optional] Web Search Tool Support" section.
-
 To enable Open Telemetry Tracing, compose.telemetry.yaml file need to be merged along with default compose.yaml file.
 Gaudi example with Open Telemetry feature:

@@ -182,9 +183,11 @@ docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/

 </details>

-#### Launch on Xeon
+#### Xeon

-On Xeon, only OpenAI models are supported. The command below will launch the multi-agent system with the `DocIndexRetriever` as the retrieval tool for the Worker RAG agent.
+On Xeon, only OpenAI models are supported.
+By default, both the RAG Agent and SQL Agent will be launched to support the React Agent.  
+The React Agent requires the DocIndexRetriever's [`compose.yaml`](../DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml) file, so two `compose.yaml` files need to be run with docker compose to start the multi-agent system.

 ```bash
 export OPENAI_API_KEY=<your-openai-key>
@@ -203,19 +206,11 @@ bash run_ingest_data.sh

 > **Note**: This is a one-time operation.

-## How to interact with the agent system with UI
+## Launch the UI

-The UI microservice is launched in the previous step with the other microservices.
-To see the UI, open a web browser to `http://${ip_address}:5173` to access the UI. Note the `ip_address` here is the host IP of the UI microservice.
+Open a web browser to `http://localhost:5173` to access the UI. Ensure the environment variable `AGENT_URL` is set to `http://$ip_address:9090/v1/chat/completions` in [ui/svelte/.env](./ui/svelte/.env); otherwise, the UI may not work properly.

-1. `create Admin Account` with a random value
-2. add opea agent endpoint `http://$ip_address:9090/v1` which is a openai compatible api
-
-![opea-agent-setting](assets/img/opea-agent-setting.png)
-
-3. test opea agent with ui
-
-![opea-agent-test](assets/img/opea-agent-test.png)
+The AgentQnA UI can be deployed locally or using Docker. To customize deployment, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).

 ## [Optional] Deploy using Helm Charts

@@ -254,8 +249,3 @@ python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" -
 ## How to register other tools with the AI agent

 The [tools](./tools) folder contains YAML and Python files for additional tools for the supervisor and worker agents. Refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md) to add tools and customize the AI agents.
-
-## Monitor and Tracing
-
-Follow [OpenTelemetry OPEA Guide](https://opea-project.github.io/latest/tutorial/OpenTelemetry/OpenTelemetry_OPEA_Guide.html) to understand how to use OpenTelemetry tracing and metrics in OPEA.  
-For AgentQnA specific tracing and metrics monitoring, follow [OpenTelemetry on AgentQnA](https://opea-project.github.io/latest/tutorial/OpenTelemetry/deploy/AgentQnA.html) section.
--- a/AgentQnA/assets/img/opea-agent-setting.png
+++ b/AgentQnA/assets/img/opea-agent-setting.png
--- a/AgentQnA/assets/img/opea-agent-test.png
+++ b/AgentQnA/assets/img/opea-agent-test.png
--- a/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AgentQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -29,7 +29,7 @@ services:
    command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192

  worker-rag-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: rag-agent-endpoint
    volumes:
      - "${TOOLSET_PATH}:/home/user/tools/"
@@ -60,7 +60,7 @@ services:
      port: 9095

  worker-sql-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: sql-agent-endpoint
    volumes:
      - "${WORKDIR}/tests/Chinook_Sqlite.sqlite:/home/user/chinook-db/Chinook_Sqlite.sqlite:rw"
@@ -89,7 +89,7 @@ services:
      port: 9096

  supervisor-react-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: react-agent-endpoint
    depends_on:
      - worker-rag-agent
--- a/AgentQnA/docker_compose/amd/gpu/rocm/compose_vllm.yaml
+++ b/AgentQnA/docker_compose/amd/gpu/rocm/compose_vllm.yaml
@@ -33,7 +33,7 @@ services:
    ipc: host

  worker-rag-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: rag-agent-endpoint
    volumes:
      - ${TOOLSET_PATH}:/home/user/tools/
@@ -64,7 +64,7 @@ services:
      port: 9095

  worker-sql-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: sql-agent-endpoint
    volumes:
      - "${WORKDIR}/tests/Chinook_Sqlite.sqlite:/home/user/chinook-db/Chinook_Sqlite.sqlite:rw"
@@ -93,7 +93,7 @@ services:
      port: 9096

  supervisor-react-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: react-agent-endpoint
    depends_on:
      - worker-rag-agent
--- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
@@ -103,8 +103,10 @@ services:
  agent-ui:
    image: opea/agent-ui
    container_name: agent-ui
+    volumes:
+      - ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env # UI .env file (holds the AGENT_URL setting)
    ports:
-      - "5173:8080"
+      - "5173:5173"
    ipc: host

 networks:
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -3,7 +3,7 @@

 services:
  worker-rag-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: rag-agent-endpoint
    volumes:
      - ${TOOLSET_PATH}:/home/user/tools/
@@ -34,7 +34,7 @@ services:
      port: 9095

  worker-sql-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: sql-agent-endpoint
    volumes:
      - ${WORKDIR}/GenAIExamples/AgentQnA/tests:/home/user/chinook-db # test db
@@ -63,7 +63,7 @@ services:
      port: 9096

  supervisor-react-agent:
-    image: ${REGISTRY:-opea}/agent:${TAG:-latest}
+    image: opea/agent:latest
    container_name: react-agent-endpoint
    depends_on:
      - worker-rag-agent
@@ -104,12 +104,14 @@ services:
      - "8080:8000"
    ipc: host
  agent-ui:
-    image: ${REGISTRY:-opea}/agent-ui:${TAG:-latest}
+    image: opea/agent-ui
    container_name: agent-ui
+    volumes:
+      - ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env
    environment:
      host_ip: ${host_ip}
    ports:
-      - "5173:8080"
+      - "5173:5173"
    ipc: host
  vllm-service:
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
@@ -117,7 +119,7 @@ services:
    ports:
      - "8086:8000"
    volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "./data:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
@@ -138,4 +140,4 @@ services:
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 4 --host 0.0.0.0 --port 8000 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 16384
+    command: --model $LLM_MODEL_ID --tensor-parallel-size 4 --host 0.0.0.0 --port 8000 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 16384
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -42,7 +42,7 @@ if [ ! -f $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite ]; then
 fi

 # configure agent ui
-# echo "AGENT_URL = 'http://$ip_address:9090/v1/chat/completions'" | tee ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env
+echo "AGENT_URL = 'http://$ip_address:9090/v1/chat/completions'" | tee ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env

 # retriever
 export host_ip=$(hostname -I | awk '{print $1}')
--- a/AgentQnA/docker_image_build/build.yaml
+++ b/AgentQnA/docker_image_build/build.yaml
@@ -17,15 +17,12 @@ services:
      dockerfile: ./docker/Dockerfile
    extends: agent
    image: ${REGISTRY:-opea}/agent-ui:${TAG:-latest}
-  vllm-gaudi:
-    build:
-      context: vllm-fork
-      dockerfile: Dockerfile.hpu
-    extends: agent
-    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
  vllm-rocm:
    build:
+      args:
+        http_proxy: ${http_proxy}
+        https_proxy: ${https_proxy}
+        no_proxy: ${no_proxy}
      context: GenAIComps
      dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
-    extends: agent
    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
--- a/AgentQnA/kubernetes/helm/cpu-values.yaml
+++ b/AgentQnA/kubernetes/helm/cpu-values.yaml
@@ -1,22 +0,0 @@
-# Copyright (C) 2025 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-tgi:
-  enabled: false
-vllm:
-  enabled: true
-  LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
-  extraCmdArgs: ["--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
-
-supervisor:
-  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-  llm_engine: vllm
-  model: "meta-llama/Meta-Llama-3-8B-Instruct"
-ragagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-  llm_engine: vllm
-  model: "meta-llama/Meta-Llama-3-8B-Instruct"
-sqlagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-  llm_engine: vllm
-  model: "meta-llama/Meta-Llama-3-8B-Instruct"
--- a/AgentQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,32 +4,13 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values

-tgi:
-  enabled: false
 vllm:
  enabled: true
-  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
-  OMPI_MCA_btl_vader_single_copy_mechanism: none
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
-  VLLM_SKIP_WARMUP: true
-  shmSize: 16Gi
-  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
-
 supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-  llm_engine: vllm
-  model: "meta-llama/Llama-3.3-70B-Instruct"
 ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-  llm_engine: vllm
-  model: "meta-llama/Llama-3.3-70B-Instruct"
 sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-  llm_engine: vllm
-  model: "meta-llama/Llama-3.3-70B-Instruct"
--- a/AgentQnA/retrieval_tool/run_ingest_data.sh
+++ b/AgentQnA/retrieval_tool/run_ingest_data.sh
@@ -1,22 +1,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-host_ip=$(hostname -I | awk '{print $1}')
-port=6007
 FILEDIR=${WORKDIR}/GenAIExamples/AgentQnA/example_data/
 FILENAME=test_docs_music.jsonl

-# AgentQnA ingestion script requires following packages
-packages=("requests" "tqdm")
-
-# Check if packages are installed
-for package in "${packages[@]}"; do
-  if pip freeze | grep -q "$package="; then
-    echo "$package is installed"
-  else
-    echo "$package is not installed"
-    pip install --no-cache-dir "$package"
-  fi
-done
-
-python3 index_data.py --filedir ${FILEDIR} --filename ${FILENAME} --host_ip $host_ip --port $port
+python3 index_data.py --filedir ${FILEDIR} --filename ${FILENAME} --host_ip "${host_ip:-$(hostname -I | awk '{print $1}')}"  # fall back to the first host address when host_ip is not exported
--- a/AgentQnA/tests/_test_compose_openai_on_xeon.sh
+++ b/AgentQnA/tests/_test_compose_openai_on_xeon.sh
@@ -31,7 +31,7 @@ function stop_retrieval_tool() {
 }

 echo "=================== #1 Building docker images===================="
-bash step1_build_images.sh xeon
+bash step1_build_images.sh
 echo "=================== #1 Building docker images completed===================="

 echo "=================== #2 Start retrieval tool===================="
--- a/AgentQnA/tests/step1_build_images.sh
+++ b/AgentQnA/tests/step1_build_images.sh
@@ -15,52 +15,42 @@ function get_genai_comps() {
    fi
 }

+
 function build_docker_images_for_retrieval_tool(){
    cd $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/
    get_genai_comps
    echo "Build all the images with --no-cache..."
-    docker compose -f build.yaml build --no-cache
+    service_list="doc-index-retriever dataprep embedding retriever reranking"
+    docker compose -f build.yaml build ${service_list} --no-cache
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+
    docker images && sleep 1s
 }

-function build_agent_docker_image_xeon() {
+function build_agent_docker_image() {
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_image_build/
    get_genai_comps
-
    echo "Build agent image with --no-cache..."
-    service_list="agent agent-ui"
-    docker compose -f build.yaml build ${service_list} --no-cache
+    docker compose -f build.yaml build --no-cache
 }

-function build_agent_docker_image_gaudi_vllm() {
-    cd $WORKDIR/GenAIExamples/AgentQnA/docker_image_build/
-    get_genai_comps
-
-    git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
-    VLLM_VER=v0.6.6.post1+Gaudi-1.20.0
-    git checkout ${VLLM_VER} &> /dev/null && cd ../
-
-    echo "Build agent image with --no-cache..."
-    service_list="agent agent-ui vllm-gaudi"
-    docker compose -f build.yaml build ${service_list} --no-cache
-}
-
-function build_agent_docker_image_rocm() {
-    cd $WORKDIR/GenAIExamples/AgentQnA/docker_image_build/
-    get_genai_comps
-
-    echo "Build agent image with --no-cache..."
-    service_list="agent agent-ui"
-    docker compose -f build.yaml build ${service_list} --no-cache
-}
-
-function build_agent_docker_image_rocm_vllm() {
-    cd $WORKDIR/GenAIExamples/AgentQnA/docker_image_build/
-    get_genai_comps
-
-    echo "Build agent image with --no-cache..."
-    service_list="agent agent-ui vllm-rocm"
-    docker compose -f build.yaml build ${service_list} --no-cache
+function build_vllm_docker_image() {
+    echo "Building the vllm docker image"
+    cd $WORKPATH
+    echo $WORKPATH
+    if [ ! -d "./vllm-fork" ]; then
+        git clone https://github.com/HabanaAI/vllm-fork.git
+    fi
+    cd ./vllm-fork
+    VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+    git checkout ${VLLM_VER} &> /dev/null
+    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    if [ $? -ne 0 ]; then
+        echo "opea/vllm-gaudi:ci failed"
+        exit 1
+    else
+        echo "opea/vllm-gaudi:ci successful"
+    fi
 }


@@ -69,32 +59,15 @@ function main() {
    build_docker_images_for_retrieval_tool
    echo "==================== Build docker images for retrieval tool completed ===================="

-    sleep 3s
+    echo "==================== Build agent docker image ===================="
+    build_agent_docker_image
+    echo "==================== Build agent docker image completed ===================="

-    case $1 in
-        "rocm")
-            echo "==================== Build agent docker image for ROCm ===================="
-            build_agent_docker_image_rocm
-            ;;
-        "rocm_vllm")
-            echo "==================== Build agent docker image for ROCm VLLM ===================="
-            build_agent_docker_image_rocm_vllm
-            ;;
-        "gaudi_vllm")
-            echo "==================== Build agent docker image for Gaudi ===================="
-            build_agent_docker_image_gaudi_vllm
-            ;;
-        "xeon")
-            echo "==================== Build agent docker image for Xeon ===================="
-            build_agent_docker_image_xeon
-            ;;
-        *)
-            echo "Invalid argument"
-            exit 1
-            ;;
-    esac
+    echo "==================== Build vllm docker image ===================="
+    build_vllm_docker_image
+    echo "==================== Build vllm docker image completed ===================="

    docker image ls | grep vllm
 }

-main $1
+main
--- a/AgentQnA/tests/step1_build_images_rocm_vllm.sh
+++ b/AgentQnA/tests/step1_build_images_rocm_vllm.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -e
+export WORKPATH=$(dirname "$PWD")
+export WORKDIR=${WORKPATH}/../../
+echo "WORKDIR=${WORKDIR}"
+export ip_address=$(hostname -I | awk '{print $1}')
+
+# Shallow-clone GenAIComps into the current directory unless it already exists; branch comes from $opea_branch (default "main").
+function get_genai_comps() {
+    if [ ! -d "GenAIComps" ] ; then
+        git clone --depth 1 --branch "${opea_branch:-main}" https://github.com/opea-project/GenAIComps.git
+    fi
+}
+
+# Build the DocIndexRetriever services named in service_list and pull the CPU text-embeddings-inference image.
+function build_docker_images_for_retrieval_tool(){
+    cd "$WORKPATH/../DocIndexRetriever/docker_image_build/"
+    get_genai_comps
+    echo "Build all the images with --no-cache..."
+    service_list="doc-index-retriever dataprep embedding retriever reranking"
+    # service_list is left unquoted on purpose: each word must reach compose as a separate service name.
+    docker compose -f build.yaml build ${service_list} --no-cache
+    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+    docker images && sleep 3s
+}
+# Build every service in $WORKPATH/docker_image_build/build.yaml without using the layer cache.
+function build_agent_docker_image() {
+    cd "$WORKPATH/docker_image_build/"
+    get_genai_comps
+    echo "Build agent image with --no-cache..."
+    docker compose -f build.yaml build --no-cache
+
+    docker images && sleep 3s
+}
+
+#function build_vllm_docker_image() {
+#    echo "Building the vllm docker image"
+#    cd $WORKPATH/
+#    docker build --no-cache -t opea/llm-vllm-rocm:ci -f Dockerfile-vllm-rocm .
+#
+#    docker images && sleep 3s
+#}
+
+# Entry point: build the retrieval-tool images, then the agent images, then list any vllm images present.
+function main() {
+    echo "==================== Build docker images for retrieval tool ===================="
+    build_docker_images_for_retrieval_tool
+    echo "==================== Build docker images for retrieval tool completed ===================="
+
+    echo "==================== Build agent docker image ===================="
+    build_agent_docker_image
+    echo "==================== Build agent docker image completed ===================="
+
+#    echo "==================== Build vllm docker image ===================="
+#    build_vllm_docker_image
+#    echo "==================== Build vllm docker image completed ===================="
+    # Informational listing only: grep exits 1 on no match, which would abort the script under `set -e`.
+    docker image ls | grep vllm || echo "No vllm image found"
+}
+
+main
--- a/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
@@ -8,8 +8,6 @@ WORKPATH=$(dirname "$PWD")
 export WORKDIR=$WORKPATH/../../
 echo "WORKDIR=${WORKDIR}"
 export ip_address=$(hostname -I | awk '{print $1}')
-export host_ip=$ip_address
-echo "ip_address=${ip_address}"
 export TOOLSET_PATH=$WORKPATH/tools/
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
@@ -26,12 +24,12 @@ ls $HF_CACHE_DIR
 vllm_port=8086
 vllm_volume=${HF_CACHE_DIR}

-
-function start_agent_service() {
-    echo "Starting agent service"
+function start_tgi(){
+    echo "Starting tgi-gaudi server"
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
    source set_env.sh
-    docker compose -f compose.yaml up -d
+    # Merge the four compose files; each needs its own -f, otherwise a bare file name is parsed as the compose subcommand.
+    docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f tgi_gaudi.yaml -f compose.telemetry.yaml up -d
 }

 function start_all_services() {
@@ -71,6 +69,7 @@ function download_chinook_data(){
    cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
 }

+
 function validate() {
    local CONTENT="$1"
    local EXPECTED_RESULT="$2"
@@ -139,6 +138,24 @@ function remove_chinook_data(){
    echo "Chinook data removed!"
 }

+export host_ip=$ip_address
+echo "ip_address=${ip_address}"
+
+# validate CONTENT EXPECTED_RESULT SERVICE_NAME: report whether CONTENT matches the grep pattern EXPECTED_RESULT.
+function validate() {  # NOTE(review): a validate() is already defined earlier in this file; this later definition replaces it - confirm the duplicate is intended.
+    local CONTENT="$1"
+    local EXPECTED_RESULT="$2"
+    local SERVICE_NAME="$3"
+
+    if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
+        echo "[ $SERVICE_NAME ] Content is as expected: $CONTENT"
+        echo 0  # match: the status is printed to stdout rather than returned as an exit code
+    else
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
+        echo 1  # no match: likewise printed, the function itself still exits 0
+    fi
+}
+
 function ingest_data_and_validate() {
    echo "Ingesting data"
    cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/
--- a/AgentQnA/tests/test_compose_on_gaudi.sh
+++ b/AgentQnA/tests/test_compose_on_gaudi.sh
@@ -10,41 +10,31 @@ export ip_address=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
 export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$ip_address"
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"./data"}


+function get_genai_comps() {  # shallow-clone GenAIComps into the current directory unless it is already present
+    if [ ! -d "GenAIComps" ] ; then
+        git clone --depth 1 --branch "${opea_branch:-main}" https://github.com/opea-project/GenAIComps.git
+    fi
+}
+
+# Build every service in AgentQnA/docker_image_build/build.yaml without using the layer cache.
+function build_agent_docker_image() {
+    cd "$WORKDIR/GenAIExamples/AgentQnA/docker_image_build/"
+    get_genai_comps
+    echo "Build agent image with --no-cache..."
+    docker compose -f build.yaml build --no-cache
+}
+
 function stop_crag() {
    cid=$(docker ps -aq --filter "name=kdd-cup-24-crag-service")
    echo "Stopping container kdd-cup-24-crag-service with cid $cid"
    if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
 }

-function stop_agent_containers() {
+function stop_agent_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi/
-    container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2)
-    for container_name in $container_list; do
-        cid=$(docker ps -aq --filter "name=$container_name")
-        echo "Stopping container $container_name"
-        if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
-    done
-}
-
-function stop_telemetry_containers(){
-    cd $WORKPATH/docker_compose/intel/hpu/gaudi/
-    container_list=$(cat compose.telemetry.yaml | grep container_name | cut -d':' -f2)
-    for container_name in $container_list; do
-        cid=$(docker ps -aq --filter "name=$container_name")
-        echo "Stopping container $container_name"
-        if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
-    done
-    container_list=$(cat compose.telemetry.yaml | grep container_name | cut -d':' -f2)
-
+    docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml down
 }

 function stop_llm(){
@@ -78,31 +68,25 @@ function stop_retrieval_tool() {
    done
 }
 echo "workpath: $WORKPATH"
-echo "::group::=================== Stop containers ===================="
-stop_llm
+echo "=================== Stop containers ===================="
 stop_crag
-stop_agent_containers
-stop_retrieval_tool
-stop_telemetry_containers
-echo "::endgroup::"
+stop_agent_docker

 cd $WORKPATH/tests

-echo "::group::=================== Building docker images===================="
-bash step1_build_images.sh gaudi_vllm > docker_image_build.log
-echo "::endgroup::"
+echo "=================== #1 Building docker images===================="
+build_agent_docker_image
+echo "=================== #1 Building docker images completed===================="

-echo "::group::=================== Start agent, API server, retrieval, and ingest data===================="
-bash step4_launch_and_validate_agent_gaudi.sh
-echo "::endgroup::"
+echo "=================== #4 Start agent, API server, retrieval, and ingest data===================="
+bash $WORKPATH/tests/step4_launch_and_validate_agent_gaudi.sh
+echo "=================== #4 Agent, retrieval test passed ===================="

-echo "::group::=================== Stop agent and API server===================="
-stop_llm
+echo "=================== #5 Stop agent and API server===================="
 stop_crag
-stop_agent_containers
-stop_retrieval_tool
-stop_telemetry_containers
+stop_agent_docker
+echo "=================== #5 Agent and API server stopped===================="
+
 echo y | docker system prune
-echo "::endgroup::"

 echo "ALL DONE!!"
--- a/AgentQnA/tests/test_compose_on_rocm.sh
+++ b/AgentQnA/tests/test_compose_on_rocm.sh
@@ -11,13 +11,7 @@ echo "WORKDIR=${WORKDIR}"
 export ip_address=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 export TOOLSET_PATH=$WORKPATH/tools/
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"./data"}
+export MODEL_CACHE="./data"

 function stop_crag() {
    cid=$(docker ps -aq --filter "name=kdd-cup-24-crag-service")
@@ -43,35 +37,34 @@ function stop_retrieval_tool() {
    done
 }
 echo "workpath: $WORKPATH"
-echo "::group::=================== Stop containers ===================="
+echo "=================== Stop containers ===================="
 stop_crag
 stop_agent_docker
 stop_retrieval_tool
-echo "::endgroup::=================== Stop containers completed ===================="

 cd $WORKPATH/tests

-echo "::group::=================== #1 Building docker images===================="
-bash step1_build_images.sh rocm > docker_image_build.log
-echo "::endgroup::=================== #1 Building docker images completed===================="
+echo "=================== #1 Building docker images===================="
+bash step1_build_images.sh
+echo "=================== #1 Building docker images completed===================="

-echo "::group::=================== #2 Start retrieval tool===================="
+echo "=================== #2 Start retrieval tool===================="
 bash step2_start_retrieval_tool.sh
-echo "::endgroup::=================== #2 Retrieval tool started===================="
+echo "=================== #2 Retrieval tool started===================="

-echo "::group::=================== #3 Ingest data and validate retrieval===================="
+echo "=================== #3 Ingest data and validate retrieval===================="
 bash step3_ingest_data_and_validate_retrieval.sh
-echo "::endgroup::=================== #3 Data ingestion and validation completed===================="
+echo "=================== #3 Data ingestion and validation completed===================="

-echo "::group::=================== #4 Start agent and API server===================="
+echo "=================== #4 Start agent and API server===================="
 bash step4a_launch_and_validate_agent_tgi_on_rocm.sh
-echo "::endgroup::=================== #4 Agent test passed ===================="
+echo "=================== #4 Agent test passed ===================="

-echo "::group::=================== #5 Stop agent and API server===================="
+echo "=================== #5 Stop agent and API server===================="
 stop_crag
 stop_agent_docker
 stop_retrieval_tool
-echo "::endgroup::=================== #5 Agent and API server stopped===================="
+echo "=================== #5 Agent and API server stopped===================="

 echo y | docker system prune

--- a/AgentQnA/tests/test_compose_vllm_on_rocm.sh
+++ b/AgentQnA/tests/test_compose_vllm_on_rocm.sh
@@ -5,18 +5,13 @@
 set -e

 WORKPATH=$(dirname "$PWD")
+export LOG_PATH=${WORKPATH}
 export WORKDIR=${WORKPATH}/../../
 echo "WORKDIR=${WORKDIR}"
 export ip_address=$(hostname -I | awk '{print $1}')
 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 export TOOLSET_PATH=$WORKPATH/tools/
-IMAGE_REPO=${IMAGE_REPO:-"opea"}
-IMAGE_TAG=${IMAGE_TAG:-"latest"}
-echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
-echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
-export REGISTRY=${IMAGE_REPO}
-export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"./data"}
+export MODEL_CACHE="./data"

 function stop_crag() {
    cid=$(docker ps -aq --filter "name=kdd-cup-24-crag-service")
@@ -37,35 +32,34 @@ function stop_retrieval_tool() {
 }

 echo "workpath: $WORKPATH"
-echo "::group::=================== Stop containers ===================="
+echo "=================== Stop containers ===================="
 stop_crag
 stop_agent_docker
 stop_retrieval_tool
-echo "::endgroup::"

 cd $WORKPATH/tests

-echo "::group::=================== #1 Building docker images===================="
-bash step1_build_images.sh rocm_vllm > docker_image_build.log
-echo "::endgroup::=================== #1 Building docker images completed===================="
+echo "=================== #1 Building docker images===================="
+bash step1_build_images_rocm_vllm.sh
+echo "=================== #1 Building docker images completed===================="

-echo "::group::=================== #2 Start retrieval tool===================="
+echo "=================== #2 Start retrieval tool===================="
 bash step2_start_retrieval_tool_rocm_vllm.sh
-echo "::endgroup::=================== #2 Retrieval tool started===================="
+echo "=================== #2 Retrieval tool started===================="

-echo "::group::=================== #3 Ingest data and validate retrieval===================="
+echo "=================== #3 Ingest data and validate retrieval===================="
 bash step3_ingest_data_and_validate_retrieval_rocm_vllm.sh
-echo "::endgroup::=================== #3 Data ingestion and validation completed===================="
+echo "=================== #3 Data ingestion and validation completed===================="

-echo "::group::=================== #4 Start agent and API server===================="
+echo "=================== #4 Start agent and API server===================="
 bash step4_launch_and_validate_agent_rocm_vllm.sh
-echo "::endgroup::=================== #4 Agent test passed ===================="
+echo "=================== #4 Agent test passed ===================="

-echo "::group::=================== #5 Stop agent and API server===================="
+echo "=================== #5 Stop agent and API server===================="
 stop_crag
 stop_agent_docker
 stop_retrieval_tool
-echo "::endgroup::=================== #5 Agent and API server stopped===================="
+echo "=================== #5 Agent and API server stopped===================="

 echo y | docker system prune

--- a/AgentQnA/tools/worker_agent_tools.py
+++ b/AgentQnA/tools/worker_agent_tools.py
@@ -12,7 +12,7 @@ def search_knowledge_base(query: str) -> str:
    print(url)
    proxies = {"http": ""}
    payload = {
-        "messages": query,
+        "text": query,
    }
    response = requests.post(url, json=payload, proxies=proxies)
    print(response)
--- a/AgentQnA/ui/docker/Dockerfile
+++ b/AgentQnA/ui/docker/Dockerfile
@@ -1,49 +1,26 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-#FROM python:3.11-slim
-FROM node:22.9.0
+# Use node 20.11.1 as the base image
+FROM node:20.11.1

-ENV LANG=C.UTF-8
-ARG ARCH=cpu
+# Install Git without recommended extras and remove the apt lists in the same layer to keep the image small
+RUN apt-get update -y && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*

-RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
-    build-essential \
-    libgl1-mesa-glx \
-    libjemalloc-dev \
-    git \
-    python3-venv
+# Copy the front-end code repository
+COPY svelte /home/user/svelte

+# Set the working directory
+WORKDIR /home/user/svelte

-WORKDIR /root/
+# Install front-end dependencies
+RUN npm install

-ENV HOME=/root
-ENV VIRTUAL_ENV=$HOME/.env/open-webui
-
-COPY open_webui_patches /root/patches
-
-RUN git clone https://github.com/open-webui/open-webui.git && \
-    git config --global user.name "opea" && git config --global user.email "" && \
-    mkdir -p $HOME/.env && python3 -m venv $VIRTUAL_ENV && \
-    $VIRTUAL_ENV/bin/python -m pip install --no-cache-dir --upgrade pip && \
-    $VIRTUAL_ENV/bin/python -m pip install --no-cache-dir build
-
-WORKDIR /root/open-webui
-
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-
-RUN git checkout v0.5.20 && \
-    git am ../patches/*.patch && \
-    python -m build && \
-    pip install --no-cache-dir dist/open_webui-0.5.20-py3-none-any.whl    
-
-ENV LANG=en_US.UTF-8
-
-WORKDIR /root/
-
-RUN rm -fr /root/open-webui && rm -fr /root/patches
-
-# CMD ["/bin/bash"]
-ENTRYPOINT ["open-webui", "serve"]
+# Build the front-end application
+RUN npm run build

+# Expose the port of the front-end application
+EXPOSE 5173

+# Run the front-end application in preview mode
+CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"]
--- a/AgentQnA/ui/open_webui_patches/0001-compatible-opea-agent-tool-content.patch
+++ b/AgentQnA/ui/open_webui_patches/0001-compatible-opea-agent-tool-content.patch
@@ -1,26 +1,17 @@
-From d90ba418f866bc11848d7d6507aabc6b5e8cc3e2 Mon Sep 17 00:00:00 2001
+From 799dcc304b3aecf2e2969df47c8dcac16d2267b0 Mon Sep 17 00:00:00 2001
 From: lkk12014402 <kaokao.lv@intel.com>
-Date: Mon, 7 Apr 2025 07:22:53 +0000
-Subject: [PATCH] compatible opea agent tool content
+Date: Fri, 4 Apr 2025 07:40:30 +0000
+Subject: [PATCH] deal opea agent tool content.

 ---
- backend/open_webui/utils/middleware.py | 56 ++++++++++++++++++++++++++
- 1 file changed, 56 insertions(+)
+ backend/open_webui/utils/middleware.py | 54 ++++++++++++++++++++++++++
+ 1 file changed, 54 insertions(+)

 diff --git a/backend/open_webui/utils/middleware.py b/backend/open_webui/utils/middleware.py
-index 289d887df..fddbe8ee1 100644
+index 289d887df..afa0edf1e 100644
 --- a/backend/open_webui/utils/middleware.py
 +++ b/backend/open_webui/utils/middleware.py
-@@ -1465,6 +1465,8 @@ async def process_chat_response(
-                 async def stream_body_handler(response):
-                     nonlocal content
-                     nonlocal content_blocks
-+                    nonlocal events
-+                    sources = []
- 
-                     response_tool_calls = []
- 
-@@ -1486,6 +1488,60 @@ async def process_chat_response(
+@@ -1486,6 +1486,60 @@ async def process_chat_response(
                         try:
                             data = json.loads(data)
 
--- a/AgentQnA/ui/open_webui_patches/0002-update-agent-icloud-upload-feature.patch
+++ b/AgentQnA/ui/open_webui_patches/0002-update-agent-icloud-upload-feature.patch
@@ -1,531 +0,0 @@
-From 8ad31e50644eab3c9e698d7828b1857919887841 Mon Sep 17 00:00:00 2001
-From: lkk12014402 <kaokao.lv@intel.com>
-Date: Tue, 8 Apr 2025 03:38:09 +0000
-Subject: [PATCH 2/2] update agent icloud upload feature
-
---
- src/lib/apis/knowledge/index.ts               |  60 +++++++
- .../admin/Settings/Connections.svelte         |  50 +++++-
- .../components/icons/UploadCloudIcon.svelte   |  18 ++
- src/lib/components/workspace/Knowledge.svelte |  57 +++++-
- .../KnowledgeBase/AddIcloudContentMenu.svelte | 164 ++++++++++++++++++
- .../KnowledgeBase/IcloudFiles.svelte          |  37 ++++
- src/lib/i18n/locales/zh-CN/translation.json   |  15 +-
- 7 files changed, 396 insertions(+), 5 deletions(-)
- create mode 100644 src/lib/components/icons/UploadCloudIcon.svelte
- create mode 100644 src/lib/components/workspace/Knowledge/KnowledgeBase/AddIcloudContentMenu.svelte
- create mode 100644 src/lib/components/workspace/Knowledge/KnowledgeBase/IcloudFiles.svelte
-
-diff --git a/src/lib/apis/knowledge/index.ts b/src/lib/apis/knowledge/index.ts
-index c5fad1323..32be528a7 100644
--- a/src/lib/apis/knowledge/index.ts
-+++ b/src/lib/apis/knowledge/index.ts
-@@ -345,3 +345,63 @@ export const deleteKnowledgeById = async (token: string, id: string) => {
- 
- 	return res;
- };
-+
-+export const getIcloudFiles = async (ICLOUD_BASE_URLS: string) => {
-+	let error = null;
-+
-+	const res = await fetch(`${ICLOUD_BASE_URLS}/dataprep/get`, {
-+		method: 'POST',
-+		headers: {
-+			Accept: 'application/json',
-+			'Content-Type': 'application/json',
-+		}
-+	})
-+		.then(async (res) => {
-+			if (!res.ok) throw await res.json();
-+			return res.json();
-+		})
-+		.then((json) => {
-+			return json;
-+		})
-+		.catch((err) => {
-+			error = err.detail;
-+
-+			console.log(err);
-+			return null;
-+		});
-+
-+	if (error) {
-+		throw error;
-+	}
-+
-+	return res;
-+};
-+
-+export const updateIcloudFiles = async (ICLOUD_BASE_URLS: string, formData: any) => {
-+	let error = null;
-+
-+	const res = await fetch(`${ICLOUD_BASE_URLS}/dataprep/ingest`, {
-+		method: 'POST',
-+		body: formData
-+	})
-+		.then(async (res) => {
-+			if (!res.ok) throw await res.json();
-+			return res.json();
-+		})
-+		.then((json) => {
-+			return json;
-+		})
-+		.catch((err) => {
-+			error = err.detail;
-+
-+			console.log(err);
-+			return null;
-+		});
-+
-+	if (error) {
-+		throw error;
-+	}
-+
-+	return res;
-+};
-+
-diff --git a/src/lib/components/admin/Settings/Connections.svelte b/src/lib/components/admin/Settings/Connections.svelte
-index 2fcfadaec..3237744d5 100644
--- a/src/lib/components/admin/Settings/Connections.svelte
-+++ b/src/lib/components/admin/Settings/Connections.svelte
-@@ -47,6 +47,9 @@
- 	let showAddOpenAIConnectionModal = false;
- 	let showAddOllamaConnectionModal = false;
- 
-+	let ENABLE_ICLOUD_API: null | boolean = (localStorage.getItem('ENABLE_ICLOUD_API') === "enable");
-+	let ICLOUD_BASE_URL = localStorage.getItem('ICLOUD_BASE_URL') || '';
-+
- 	const updateOpenAIHandler = async () => {
- 		if (ENABLE_OPENAI_API !== null) {
- 			// Remove trailing slashes
-@@ -193,10 +196,22 @@
- 		}
- 	});
- 
-+	const updateIcloudHandler = async () => {
-+		if (ENABLE_ICLOUD_API) {
-+			localStorage.setItem('ICLOUD_BASE_URL', ICLOUD_BASE_URL);
-+			localStorage.setItem('ENABLE_ICLOUD_API', "enable");
-+		} else {
-+			localStorage.setItem('ICLOUD_BASE_URL', '');
-+			localStorage.setItem('ENABLE_ICLOUD_API', "");
-+		}
-+		toast.success($i18n.t('Icloud API settings updated'));
-+	};
-+
- 	const submitHandler = async () => {
- 		updateOpenAIHandler();
- 		updateOllamaHandler();
- 		updateDirectConnectionsHandler();
-+		updateIcloudHandler();
- 
- 		dispatch('save');
- 	};
-@@ -301,7 +316,7 @@
- 				</div>
- 
- 				{#if ENABLE_OLLAMA_API}
-					<hr class=" border-gray-100 dark:border-gray-850 my-2" />
-+					<hr class=" border-gray-100 dark:border-gray-850" />
- 
- 					<div class="">
- 						<div class="flex justify-between items-center">
-@@ -358,6 +373,39 @@
- 				{/if}
- 			</div>
- 
-+			<hr class=" border-gray-50 dark:border-gray-850" />
-+
-+			<div class="pr-1.5 my-2">
-+				<div class="flex justify-between items-center text-sm">
-+					<div class="font-medium">{$i18n.t('Icloud File API')}</div>
-+
-+					<div class="mt-1">
-+						<Switch 
-+							bind:state={ENABLE_ICLOUD_API} 
-+							on:change={async () => {
-+								updateIcloudHandler();
-+							}}
-+						/>
-+					</div>
-+				</div>
-+
-+				{#if ENABLE_ICLOUD_API}
-+					<hr class=" border-gray-50 dark:border-gray-850 my-2" />
-+
-+					<div class="">
-+						<div class="flex w-full gap-1.5">
-+							<div class="flex-1 flex flex-col gap-1.5">
-+								<input
-+									class="w-full text-sm bg-transparent outline-none"
-+									placeholder={$i18n.t('Enter Icloud URL(e.g.') + 'http://localhost:6007/v1)'}
-+									bind:value={ICLOUD_BASE_URL}
-+								/>
-+							</div>
-+						</div>
-+					</div>
-+				{/if}
-+			</div>
-+
- 			<hr class=" border-gray-100 dark:border-gray-850" />
- 
- 			<div class="pr-1.5 my-2">
-diff --git a/src/lib/components/icons/UploadCloudIcon.svelte b/src/lib/components/icons/UploadCloudIcon.svelte
-new file mode 100644
-index 000000000..eed3bd582
--- /dev/null
-+++ b/src/lib/components/icons/UploadCloudIcon.svelte
-@@ -0,0 +1,18 @@
-+<script lang="ts">
-+	export let className = 'w-4 h-4';
-+</script>
-+
-+<svg
-+	t="1744007283647"
-+	viewBox="0 0 1491 1024"
-+	version="1.1"
-+	xmlns="http://www.w3.org/2000/svg"
-+	p-id="1630"
-+	class = {className}
-+	><path
-+		d="M546.047379 263.651842s-90.221363-91.423424-212.63125-16.762074c-109.521121 71.300031-90.154581 201.768179-90.154582 201.76818S0 498.498962 0 759.902727c5.431535 261.003078 264.186314 263.674325 264.186314 263.674326l388.443814 0.422947V744.565318H466.355181l279.434681-279.412421 279.390161 279.412421h-186.297208V1024l377.157796-0.422947s240.812904 0.222604 274.648698-248.092052c16.094262-271.576764-232.754643-325.113003-232.754643-325.113003S1286.205362 48.327085 936.761752 2.470681C637.181417-29.740104 546.047379 263.651842 546.047379 263.651842z"
-+		fill="#507BFC"
-+		p-id="1631"
-+	></path></svg
-+>
-+
-diff --git a/src/lib/components/workspace/Knowledge.svelte b/src/lib/components/workspace/Knowledge.svelte
-index 57d45312d..43a1f305e 100644
--- a/src/lib/components/workspace/Knowledge.svelte
-+++ b/src/lib/components/workspace/Knowledge.svelte
-@@ -13,7 +13,8 @@
- 	import {
- 		getKnowledgeBases,
- 		deleteKnowledgeById,
-		getKnowledgeBaseList
-+		getKnowledgeBaseList,
-+		getIcloudFiles
- 	} from '$lib/apis/knowledge';
- 
- 	import { goto } from '$app/navigation';
-@@ -26,6 +27,11 @@
- 	import Spinner from '../common/Spinner.svelte';
- 	import { capitalizeFirstLetter } from '$lib/utils';
- 	import Tooltip from '../common/Tooltip.svelte';
-+	import AddIcloudConnectionModal from '$lib/components/workspace/Knowledge/KnowledgeBase/AddIcloudContentMenu.svelte';
-+	import IcloudFiles from '$lib/components/workspace/Knowledge/KnowledgeBase/IcloudFiles.svelte';
-+
-+	let showAddTextContentModal = false;
-+	let IcloudFile = [];
- 
- 	let loaded = false;
- 
-@@ -65,9 +71,26 @@
- 	};
- 
- 	onMount(async () => {
-+		await updateIcloudFiles();
-+
- 		knowledgeBases = await getKnowledgeBaseList(localStorage.token);
- 		loaded = true;
- 	});
-+
-+	async function updateIcloudFiles() {
-+		let ICLOUD_BASE_URL = localStorage.getItem('ICLOUD_BASE_URL') || '';
-+		console.log('ICLOUD_BASE_URL', ICLOUD_BASE_URL);
-+		
-+		if (ICLOUD_BASE_URL !== '') {
-+			const res = await getIcloudFiles(ICLOUD_BASE_URL).catch((e) => {
-+				toast.error(`${e}`);
-+			});
-+
-+			if (res) {
-+				IcloudFile = res;
-+			}
-+		}
-+	}
- </script>
- 
- <svelte:head>
-@@ -187,11 +210,39 @@
- 		{/each}
- 	</div>
- 
-	<div class=" text-gray-500 text-xs mt-1 mb-2">
-		ⓘ {$i18n.t("Use '#' in the prompt input to load and include your knowledge.")}
-+	<div class="flex justify-between items-center">
-+		<div class="flex md:self-center text-xl font-medium px-0.5 items-center">
-+			{$i18n.t('Icloud Knowledge')}
-+			<div class="flex self-center w-[1px] h-6 mx-2.5 bg-gray-50 dark:bg-gray-850" />
-+			<span class="text-lg font-medium text-gray-500 dark:text-gray-300">{IcloudFile.length}</span>
-+		</div>
-+		<div>
-+			<button
-+				class=" px-2 py-2 rounded-xl hover:bg-gray-700/10 dark:hover:bg-gray-100/10 dark:text-gray-300 dark:hover:text-white transition font-medium text-sm flex items-center space-x-1"
-+				aria-label={$i18n.t('Upload to Icloud')}
-+				on:click={() => {
-+					showAddTextContentModal = !showAddTextContentModal;
-+				}}
-+			>
-+				<Plus className="size-3.5" />
-+			</button>
-+		</div>
-+	</div>
-+	<hr class="border-gray-100 dark:border-gray-850 my-2" />
-+	<div class=" flex overflow-y-auto w-full h-[15rem] scrollbar-hidden text-xs">
-+		<IcloudFiles files={IcloudFile} />
- 	</div>
- {:else}
- 	<div class="w-full h-full flex justify-center items-center">
- 		<Spinner />
- 	</div>
- {/if}
-+
-+<AddIcloudConnectionModal
-+	bind:show={showAddTextContentModal}
-+	on:updateIcloudFile={async (e) => {
-+		if (e.detail.status) {
-+			await updateIcloudFiles();
-+		}
-+	}}
-+/>
-diff --git a/src/lib/components/workspace/Knowledge/KnowledgeBase/AddIcloudContentMenu.svelte b/src/lib/components/workspace/Knowledge/KnowledgeBase/AddIcloudContentMenu.svelte
-new file mode 100644
-index 000000000..fb906a0d3
--- /dev/null
-+++ b/src/lib/components/workspace/Knowledge/KnowledgeBase/AddIcloudContentMenu.svelte
-@@ -0,0 +1,164 @@
-+<script lang="ts">
-+	import { toast } from 'svelte-sonner';
-+	import { getContext, onMount, createEventDispatcher } from 'svelte';
-+	import Modal from '$lib/components/common/Modal.svelte';
-+	import UploadCloudIcon from '$lib/components/icons/UploadCloudIcon.svelte';
-+	import Spinner from '$lib/components/common/Spinner.svelte';
-+	import { updateIcloudFiles } from '$lib/apis/knowledge';
-+
-+	const i18n = getContext('i18n');
-+	const dispatch = createEventDispatcher();
-+
-+	export let show = false;
-+
-+	let url = '';
-+
-+	let loading = false;
-+
-+	let selectedFile = null;
-+
-+	function handleFileSelect(event) {
-+		selectedFile = event.target.files[0];
-+	}
-+
-+	function parseAndValidateUrls(normalizedInput: string): string[] {
-+		return normalizedInput
-+			.split(',')
-+			.map((candidate) => {
-+				const processed = candidate.replace(/^["']+|["']+$/g, '').trim();
-+
-+				try {
-+					new URL(processed);
-+					return processed;
-+				} catch {
-+					return null;
-+				}
-+			})
-+			.filter((url): url is string => url !== null);
-+	}
-+
-+	async function submitHandler() {
-+		loading = true;
-+
-+		if (!url && !selectedFile) {
-+			loading = false;
-+			show = false;
-+
-+			toast.error($i18n.t('URL or File are required'));
-+			return;
-+		}
-+		if (url && selectedFile) {
-+			loading = false;
-+			show = false;
-+
-+			toast.error($i18n.t('Upload file or enter URL'));
-+			url = '';
-+			selectedFile = null;
-+			return;
-+		}
-+
-+		const formData = new FormData();
-+		if (url) {
-+			formData.append('link_list', JSON.stringify(parseAndValidateUrls(url)));
-+		}
-+		if (selectedFile) {
-+			formData.append('files', selectedFile, selectedFile.name);
-+		}
-+		let ICLOUD_BASE_URL = localStorage.getItem('ICLOUD_BASE_URL') || '';
-+		console.log('ICLOUD_BASE_URL', ICLOUD_BASE_URL);
-+
-+		if (ICLOUD_BASE_URL !== '') {
-+			const res = await updateIcloudFiles(ICLOUD_BASE_URL, formData).catch((e) => {
-+				toast.error(`${e}`);
-+
-+				return;
-+			});
-+
-+			if (res) {
-+				toast.success($i18n.t('Upload Succeed'));
-+				dispatch('updateIcloudFile', { status: true });
-+			}
-+
-+			url = '';
-+			selectedFile = null;
-+			loading = false;
-+			show = false;
-+		}
-+	}
-+</script>
-+
-+<Modal size="sm" bind:show>
-+	<div class="flex flex-col justify-end">
-+		<div class=" flex justify-between dark:text-gray-100 px-5 pt-4 pb-2">
-+			<div class="flex-col text-lg font-medium self-center font-primary">
-+				{$i18n.t('Upload Icloud file')}
-+				<span class="text-sm text-gray-500">- {$i18n.t('choose URL or local file')}</span>
-+			</div>
-+
-+			<button
-+				class="self-center"
-+				on:click={() => {
-+					show = false;
-+				}}
-+			>
-+				<svg
-+					xmlns="http://www.w3.org/2000/svg"
-+					viewBox="0 0 20 20"
-+					fill="currentColor"
-+					class="w-5 h-5"
-+				>
-+					<path
-+						d="M6.28 5.22a.75.75 0 00-1.06 1.06L8.94 10l-3.72 3.72a.75.75 0 101.06 1.06L10 11.06l3.72 3.72a.75.75 0 101.06-1.06L11.06 10l3.72-3.72a.75.75 0 00-1.06-1.06L10 8.94 6.28 5.22z"
-+					/>
-+				</svg>
-+			</button>
-+		</div>
-+
-+		<div class="flex flex-col md:flex-row w-full px-4 pb-4 md:space-x-4 dark:text-gray-200">
-+			<div class=" flex flex-col w-full sm:flex-row sm:justify-center sm:space-x-6">
-+				<div class="flex items-center w-full">
-+					<div class="flex-1 min-w-0 mr-2">
-+						<div class="flex flex-col w-full my-8 mx-2">
-+							<input
-+								class="w-full text-sm bg-transparent placeholder:text-gray-300 outline-none border-b-solid border-b-2 border-blue-500 rounded p-2"
-+								type="text"
-+								bind:value={url}
-+								placeholder={$i18n.t('Upload from URL')}
-+							/>
-+						</div>
-+					</div>
-+
-+					<div class="flex-none w-[1px] h-[60%] mx-2.5 bg-gray-300"></div>
-+
-+					<div class="flex-1 min-w-0">
-+						<input type="file" id="fileInput" hidden on:change={handleFileSelect} />
-+
-+						<label
-+							for="fileInput"
-+							class="cursor-pointer flex flex-col items-center hover:bg-gray-100 rounded-lg p-2 transition-colors"
-+						>
-+							<UploadCloudIcon className="w-12 h-12 text-gray-500" />
-+							<div class="text-xs text-gray-500 pt-2">
-+								{selectedFile ? selectedFile.name : '点击上传文件'}
-+							</div>
-+						</label>
-+					</div>
-+				</div>
-+			</div>
-+		</div>
-+		{#if loading}
-+			<Spinner className="my-4 size-4" />
-+		{:else}
-+			<button
-+				class="bg-blue-500 hover:bg-blue-700 text-white font-bold py-3 px-4 rounded text-sm"
-+				on:click={(e) => {
-+					e.preventDefault();
-+					submitHandler();
-+				}}
-+			>
-+				{$i18n.t('Upload Confirm')}
-+			</button>
-+		{/if}
-+	</div>
-+</Modal>
-+
-diff --git a/src/lib/components/workspace/Knowledge/KnowledgeBase/IcloudFiles.svelte b/src/lib/components/workspace/Knowledge/KnowledgeBase/IcloudFiles.svelte
-new file mode 100644
-index 000000000..d6490dce2
--- /dev/null
-+++ b/src/lib/components/workspace/Knowledge/KnowledgeBase/IcloudFiles.svelte
-@@ -0,0 +1,37 @@
-+<script lang="ts">
-+	export let selectedFileId = null;
-+	export let files = [];
-+
-+	export let small = false;
-+</script>
-+
-+<div class="max-h-full flex flex-col w-full">
-+	{#each files as file}
-+		<div class="mt-1 px-2 flex hover:bg-gray-50 transition">
-+			<div class="p-3 bg-black/20 dark:bg-white/10 text-white rounded-xl my-2">
-+				<svg
-+					xmlns="http://www.w3.org/2000/svg"
-+					viewBox="0 0 24 24"
-+					fill="currentColor"
-+					class=" size-3"
-+				>
-+					<path
-+						fill-rule="evenodd"
-+						d="M5.625 1.5c-1.036 0-1.875.84-1.875 1.875v17.25c0 1.035.84 1.875 1.875 1.875h12.75c1.035 0 1.875-.84 1.875-1.875V12.75A3.75 3.75 0 0 0 16.5 9h-1.875a1.875 1.875 0 0 1-1.875-1.875V5.25A3.75 3.75 0 0 0 9 1.5H5.625ZM7.5 15a.75.75 0 0 1 .75-.75h7.5a.75.75 0 0 1 0 1.5h-7.5A.75.75 0 0 1 7.5 15Zm.75 2.25a.75.75 0 0 0 0 1.5H12a.75.75 0 0 0 0-1.5H8.25Z"
-+						clip-rule="evenodd"
-+					/>
-+					<path
-+						d="M12.971 1.816A5.23 5.23 0 0 1 14.25 5.25v1.875c0 .207.168.375.375.375H16.5a5.23 5.23 0 0 1 3.434 1.279 9.768 9.768 0 0 0-6.963-6.963Z"
-+					/>
-+				</svg>
-+			</div>
-+
-+			<div class="flex flex-col justify-center -space-y-0.5 px-2.5 w-full">
-+				<div class=" dark:text-gray-100 text-sm font-medium line-clamp-1 mb-1">
-+					{file.name}
-+				</div>
-+			</div>
-+		</div>
-+	{/each}
-+</div>
-+
-diff --git a/src/lib/i18n/locales/zh-CN/translation.json b/src/lib/i18n/locales/zh-CN/translation.json
-index ebb53a1b5..d6b72e04d 100644
--- a/src/lib/i18n/locales/zh-CN/translation.json
-+++ b/src/lib/i18n/locales/zh-CN/translation.json
-@@ -1174,5 +1174,18 @@
- 	"Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "您的全部捐款将直接给到插件开发者，Open WebUI 不会收取任何比例。但众筹平台可能会有服务费、抽成。",
- 	"Youtube": "YouTube",
- 	"Youtube Language": "Youtube 语言",
-	"Youtube Proxy URL": "Youtube 代理 URL"
-+	"Youtube Proxy URL": "Youtube 代理 URL",
-+	"Upload Icloud file": "上传到云端",
-+	"choose URL or local file": "选择URL或本地文件",
-+	"Upload from URL": "从URL上传",
-+	"Upload Confirm": "确认上传",
-+	"URL or File are required": "未上传文件",
-+	"Upload file or enter URL": "文件与URL不能同时提交",
-+	"Icloud File": "云端文件",
-+	"Icloud File API": "云端存储API",
-+	"Enter Icloud URL(e.g.": "输入云端存储URL（例如.",
-+	"Upload to Icloud": "上传到云端",
-+	"Icloud Knowledge": "云端数据库",
-+	"Upload Succeed": "上传文件成功",
-+	"Icloud API settings updated": "云端存储API设置已更新"
- }
-- 
-2.34.1
-
--- a/AgentQnA/ui/open_webui_patches/0003-update-build-script.patch
+++ b/AgentQnA/ui/open_webui_patches/0003-update-build-script.patch
@@ -1,56 +0,0 @@
-From ebf3218eef81897b536521e2140bdd9176f3ace3 Mon Sep 17 00:00:00 2001
-From: lkk12014402 <kaokao.lv@intel.com>
-Date: Tue, 8 Apr 2025 07:13:20 +0000
-Subject: [PATCH 3/3] update build script
-
---
- hatch_build.py | 23 ++++++++++++++++++-----
- 1 file changed, 18 insertions(+), 5 deletions(-)
-
-diff --git a/hatch_build.py b/hatch_build.py
-index 8ddaf0749..e15d6e99d 100644
--- a/hatch_build.py
-+++ b/hatch_build.py
-@@ -3,21 +3,34 @@ import os
- import shutil
- import subprocess
- from sys import stderr
-
-+ 
- from hatchling.builders.hooks.plugin.interface import BuildHookInterface
-
-
-+ 
-+ 
- class CustomBuildHook(BuildHookInterface):
-     def initialize(self, version, build_data):
-         super().initialize(version, build_data)
-        stderr.write(">>> Building Open Webui frontend\n")
-+        stderr.write(">>> Building DCAI小智 frontend\n")
-         npm = shutil.which("npm")
-         if npm is None:
-             raise RuntimeError(
-                "NodeJS `npm` is required for building Open Webui but it was not found"
-+                "NodeJS `npm` is required for building DCAI小智 but it was not found"
-             )
-+        stderr.write("### Installing onnxruntime-node\n")
-+        subprocess.run([npm, "install", "onnxruntime-node", "--onnxruntime-node-install-cuda=skip"], check=True)  # noqa: S603
-+       
-+        stderr.write("### Installing huggingface/transformers.js\n")
-+        subprocess.run([npm, "i", "@huggingface/transformers"], check=True)  # noqa: S603
-+       
-+        ort_version = "1.20.1"
-+        ort_url = f"https://github.com/microsoft/onnxruntime/releases/download/v{ort_version}/onnxruntime-linux-x64-gpu-{ort_version}.tgz"
-+       
-+        stderr.write(f"### Downloading onnxruntime binaries from {ort_url}\n")
-+        subprocess.run(["curl", "-L", ort_url, "-o", f"onnxruntime-linux-x64-gpu-{ort_version}.tgz"], check=True)  # noqa: S603
-+       
-         stderr.write("### npm install\n")
-         subprocess.run([npm, "install"], check=True)  # noqa: S603
-+ 
-         stderr.write("\n### npm run build\n")
-         os.environ["APP_BUILD_HASH"] = version
-         subprocess.run([npm, "run", "build"], check=True)  # noqa: S603
-- 
-2.34.1
-
--- a/AgentQnA/ui/open_webui_patches/0004-enhance-tool-formating.patch
+++ b/AgentQnA/ui/open_webui_patches/0004-enhance-tool-formating.patch
@@ -1,31 +0,0 @@
-From 36d61dab9306cb8f12c4497a32781d84f8cfb2e7 Mon Sep 17 00:00:00 2001
-From: lkk12014402 <kaokao.lv@intel.com>
-Date: Tue, 8 Apr 2025 07:22:36 +0000
-Subject: [PATCH 4/4] enhance tool formatting
-
---
- backend/open_webui/utils/middleware.py | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/backend/open_webui/utils/middleware.py b/backend/open_webui/utils/middleware.py
-index fddbe8ee1..9e44ed91a 100644
--- a/backend/open_webui/utils/middleware.py
-+++ b/backend/open_webui/utils/middleware.py
-@@ -1142,12 +1142,12 @@ async def process_chat_response(
-                                 result_display_content = f"{result_display_content}\n> {tool_name}: {result.get('content', '')}"
- 
-                             if not raw:
-                                content = f'{content}\n<details type="tool_calls" done="true" content="{html.escape(json.dumps(block_content))}" results="{html.escape(json.dumps(results))}">\n<summary>Tool Executed</summary>\n{result_display_content}\n</details>\n'
-+                                content = f'{content}\n<details type="tool_calls" done="true" content="{html.escape(json.dumps(block_content))}" results="{html.escape(json.dumps(results))}">\n<summary> Tool: {tool_call.get('function', {}).get('name', '')} Executed</summary>\n{result_display_content}\n</details>\n'
-                         else:
-                             tool_calls_display_content = ""
- 
-                             for tool_call in block_content:
-                                tool_calls_display_content = f"{tool_calls_display_content}\n> Executing {tool_call.get('function', {}).get('name', '')}"
-+                                tool_calls_display_content = f"{tool_calls_display_content}\n> Executing Tool: {tool_call.get('function', {}).get('name', '')}"
- 
-                             if not raw:
-                                 content = f'{content}\n<details type="tool_calls" done="false" content="{html.escape(json.dumps(block_content))}">\n<summary>Tool Executing...</summary>\n{tool_calls_display_content}\n</details>\n'
-- 
-2.34.1
-
--- a/AgentQnA/ui/open_webui_patches/0005-fix-tool-call-typo.patch
+++ b/AgentQnA/ui/open_webui_patches/0005-fix-tool-call-typo.patch
@@ -1,25 +0,0 @@
-From 4723fb2df86df3e1c300f12fc0649823ea1a753b Mon Sep 17 00:00:00 2001
-From: lkk12014402 <kaokao.lv@intel.com>
-Date: Tue, 8 Apr 2025 08:09:36 +0000
-Subject: [PATCH 5/5] fix tool call typo.
-
---
- backend/open_webui/utils/middleware.py | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/backend/open_webui/utils/middleware.py b/backend/open_webui/utils/middleware.py
-index 9e44ed91a..82aed5346 100644
--- a/backend/open_webui/utils/middleware.py
-+++ b/backend/open_webui/utils/middleware.py
-@@ -1142,7 +1142,7 @@ async def process_chat_response(
-                                 result_display_content = f"{result_display_content}\n> {tool_name}: {result.get('content', '')}"
- 
-                             if not raw:
-                                content = f'{content}\n<details type="tool_calls" done="true" content="{html.escape(json.dumps(block_content))}" results="{html.escape(json.dumps(results))}">\n<summary> Tool: {tool_call.get('function', {}).get('name', '')} Executed</summary>\n{result_display_content}\n</details>\n'
-+                                content = f'{content}\n<details type="tool_calls" done="true" content="{html.escape(json.dumps(block_content))}" results="{html.escape(json.dumps(results))}">\n<summary> Tool: {tool_call.get("function", {}).get("name", "")} Executed</summary>\n{result_display_content}\n</details>\n'
-                         else:
-                             tool_calls_display_content = ""
- 
-- 
-2.34.1
-
--- a/AudioQnA/Dockerfile
+++ b/AudioQnA/Dockerfile
@@ -1,9 +1,8 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-ARG IMAGE_REPO=opea
 ARG BASE_TAG=latest
-FROM $IMAGE_REPO/comps-base:$BASE_TAG
+FROM opea/comps-base:$BASE_TAG

 COPY ./audioqna.py $HOME/audioqna.py

--- a/AudioQnA/Dockerfile.multilang
+++ b/AudioQnA/Dockerfile.multilang
@@ -1,9 +1,8 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-ARG IMAGE_REPO=opea
 ARG BASE_TAG=latest
-FROM $IMAGE_REPO/comps-base:$BASE_TAG
+FROM opea/comps-base:$BASE_TAG

 COPY ./audioqna_multilang.py $HOME/audioqna_multilang.py

--- a/AudioQnA/README.md
+++ b/AudioQnA/README.md
@@ -2,13 +2,6 @@

 AudioQnA is an example that demonstrates the integration of Generative AI (GenAI) models for performing question-answering (QnA) on audio files, with the added functionality of Text-to-Speech (TTS) for generating spoken responses. The example showcases how to convert audio input to text using Automatic Speech Recognition (ASR), generate answers to user queries using a language model, and then convert those answers back to speech using Text-to-Speech (TTS).

-## Table of Contents
-
-1. [Architecture](#architecture)
-2. [Deployment Options](#deployment-options)
-
-## Architecture
-
 The AudioQnA example is implemented using the component-level microservices defined in [GenAIComps](https://github.com/opea-project/GenAIComps). The flow chart below shows the information flow between different microservices for this example.

 ```mermaid
@@ -66,13 +59,37 @@ flowchart LR

 ```

-## Deployment Options
+## Deploy AudioQnA Service

-The table below lists currently available deployment options. They outline in detail the implementation of this example on selected hardware.
+The AudioQnA service can be deployed on either Intel Gaudi2 or Intel Xeon Scalable Processor.

-| Category               | Deployment Option | Description                                                      |
-| ---------------------- | ----------------- | ---------------------------------------------------------------- |
-| On-premise Deployments | Docker compose    | [AudioQnA deployment on Xeon](./docker_compose/intel/cpu/xeon)   |
-|                        |                   | [AudioQnA deployment on Gaudi](./docker_compose/intel/hpu/gaudi) |
-|                        |                   | [AudioQnA deployment on AMD ROCm](./docker_compose/amd/gpu/rocm) |
-|                        | Kubernetes        | [Helm Charts](./kubernetes/helm)                                 |
+### Deploy AudioQnA on Gaudi
+
+Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) for instructions on deploying AudioQnA on Gaudi.
+
+### Deploy AudioQnA on Xeon
+
+Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for instructions on deploying AudioQnA on Xeon.
+
+## Deploy using Helm Chart
+
+Refer to the [AudioQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AudioQnA on Kubernetes.
+
+## Supported Models
+
+### ASR
+
+The default model is [openai/whisper-small](https://huggingface.co/openai/whisper-small). It also supports all models in the Whisper family, such as `openai/whisper-large-v3`, `openai/whisper-medium`, `openai/whisper-base`, `openai/whisper-tiny`, etc.
+
+To replace the model, please edit the `compose.yaml` and add the `command` line to pass the name of the model you want to use:
+
+```yaml
+services:
+  whisper-service:
+    ...
+    command: --model_name_or_path openai/whisper-tiny
+```
+
+### TTS
+
+The default model is [microsoft/SpeechT5](https://huggingface.co/microsoft/speecht5_tts). We currently do not support replacing the model. More models under the commercial license will be added in the future.
--- a/AudioQnA/README_miscellaneous.md
+++ b/AudioQnA/README_miscellaneous.md
@@ -1,42 +0,0 @@
-# AudioQnA Docker Image Build
-
-## Table of Contents
-
-1. [Build MegaService Docker Image](#build-megaservice-docker-image)
-2. [Build UI Docker Image](#build-ui-docker-image)
-3. [Generate a HuggingFace Access Token](#generate-a-huggingface-access-token)
-4. [Troubleshooting](#troubleshooting)
-
-## Build MegaService Docker Image
-
-To construct the Megaservice of AudioQnA, the [GenAIExamples](https://github.com/opea-project/GenAIExamples.git) repository is utilized. Build Megaservice Docker image using command below:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples.git
-cd GenAIExamples/AudioQnA
-docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-## Build UI Docker Image
-
-Build frontend Docker image using below command:
-
-```bash
-cd GenAIExamples/AudioQnA/ui
-docker build -t opea/audioqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
-```
-
-## Generate a HuggingFace Access Token
-
-Some HuggingFace resources, such as some models, are only accessible if the developer has an access token. In the absence of a HuggingFace access token, the developer can create one by first creating an account by following the steps provided at [HuggingFace](https://huggingface.co/) and then generating a [user access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token).
-
-## Troubleshooting
-
-1. If you get errors like "Access Denied", [validate micro service](https://github.com/opea-project/GenAIExamples/tree/main/AudioQnA/docker_compose/intel/cpu/xeon/README.md#validate-microservices) first. A simple example:
-
-   ```bash
-   curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
-   ```
-
-2. (Docker only) If all microservices work well, check the port ${host_ip}:7777, the port may be allocated by other users, you can modify the `compose.yaml`.
-3. (Docker only) If you get errors like "The container name is in use", change container name in `compose.yaml`.
--- a/AudioQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/README.md
@@ -1,59 +1,120 @@
-# Deploying AudioQnA on AMD ROCm GPU
+# Build Mega Service of AudioQnA on AMD ROCm GPU

-This document outlines the single node deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservices on server with AMD ROCm processing accelerators. The steps include pulling Docker images, container deployment via Docker Compose, and service execution using microservices `llm`.
+This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice
+pipeline on a server with an AMD ROCm GPU platform.

-Note: The default LLM is `Intel/neural-chat-7b-v3-3`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+## Build Docker Images

-## Table of Contents
+### 1. Build Docker Image

-1. [AudioQnA Quick Start Deployment](#audioqna-quick-start-deployment)
-2. [AudioQnA Docker Compose Files](#audioqna-docker-compose-files)
-3. [Validate Microservices](#validate-microservices)
-4. [Conclusion](#conclusion)
+- #### Create application install directory and go to it:

-## AudioQnA Quick Start Deployment
+  ```bash
+  mkdir ~/audioqna-install && cd ~/audioqna-install
+  ```

-This section describes how to quickly deploy and test the AudioQnA service manually on an AMD ROCm platform. The basic steps are:
+- #### Clone the repository GenAIExamples (the default repository branch "main" is used here):

-1. [Access the Code](#access-the-code)
-2. [Configure the Deployment Environment](#configure-the-deployment-environment)
-3. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose)
-4. [Check the Deployment Status](#check-the-deployment-status)
-5. [Validate the Pipeline](#validate-the-pipeline)
-6. [Cleanup the Deployment](#cleanup-the-deployment)
+  ```bash
+  git clone https://github.com/opea-project/GenAIExamples.git
+  ```

-### Access the Code
+  If you need to use a specific branch/tag of the GenAIExamples repository, run the following instead (replace v1.3 with the branch/tag you need):

-Clone the GenAIExample repository and access the AudioQnA AMD ROCm platform Docker Compose files and supporting scripts:
+  ```bash
+  git clone https://github.com/opea-project/GenAIExamples.git && cd GenAIExamples && git checkout v1.3
+  ```
+
+  We remind you that when using a specific version of the code, you need to use the README from this version.
+
+- #### Go to build directory:
+
+  ```bash
+  cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_image_build
+  ```
+
+- Cleaning up the GenAIComps repository if it was previously cloned in this directory.
+  This is necessary if the build was performed earlier and the GenAIComps folder exists and is not empty:
+
+  ```bash
+  echo Y | rm -R GenAIComps
+  ```
+
+- #### Clone the repository GenAIComps (the default repository branch "main" is used here):

 ```bash
-git clone https://github.com/opea-project/GenAIExamples.git
-cd GenAIExamples/AudioQnA
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
 ```

-Then checkout a released version, such as v1.3:
+We remind you that when using a specific version of the code, you need to use the README from this version.

-```bash
-git checkout v1.3
-```
+- #### Setting the list of images for the build (from the build.yaml file)

-### Configure the Deployment Environment
+  If you want to deploy a vLLM-based or TGI-based application, then the set of services is installed as follows:

-#### Docker Compose GPU Configuration
+  #### vLLM-based application

-Consult the section on [AudioQnA Service configuration](#audioqna-configuration) for information on how service specific configuration parameters affect deployments.
+  ```bash
+  service_list="vllm-rocm whisper speecht5 audioqna audioqna-ui"
+  ```

-To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose files (`compose.yaml`, `compose_vllm.yaml`) for the LLM serving container:
+  #### TGI-based application
+
+  ```bash
+  service_list="whisper speecht5 audioqna audioqna-ui"
+  ```
+
+- #### Optional. Pull TGI Docker Image (Do this if you want to use TGI)
+
+  ```bash
+  docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+  ```
+
+- #### Build Docker Images
+
+  ```bash
+  docker compose -f build.yaml build ${service_list} --no-cache
+  ```
+
+  After the build, we check the list of images with the command:
+
+  ```bash
+  docker image ls
+  ```
+
+  The list of images should include:
+
+  ##### vLLM-based application:
+
+  - opea/vllm-rocm:latest
+  - opea/whisper:latest
+  - opea/speecht5:latest
+  - opea/audioqna:latest
+
+  ##### TGI-based application:
+
+  - ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
+  - opea/whisper:latest
+  - opea/speecht5:latest
+  - opea/audioqna:latest
+
+---
+
+## Deploy the AudioQnA Application
+
+### Docker Compose Configuration for AMD GPUs
+
+To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose files:
+
+- compose_vllm.yaml - for vLLM-based application
+- compose.yaml - for TGI-based

 ```yaml
-# Example for vLLM service in compose_vllm.yaml
-# Note: Modern docker compose might use deploy.resources syntax instead.
-# Check your docker version and compose file.
 shm_size: 1g
 devices:
  - /dev/kfd:/dev/kfd
  - /dev/dri/:/dev/dri/
-# - /dev/dri/render128:/dev/dri/render128
 cap_add:
  - SYS_PTRACE
 group_add:
@@ -62,161 +123,131 @@ security_opt:
  - seccomp:unconfined
 ```

-#### Environment Variables (`set_env*.sh`)
+This configuration forwards all available GPUs to the container. To use a specific GPU, specify its `cardN` and `renderN` device IDs. For example:

-These scripts (`set_env_vllm.sh` for vLLM, `set_env.sh` for TGI) configure crucial parameters passed to the containers.
+```yaml
+shm_size: 1g
+devices:
+  - /dev/kfd:/dev/kfd
+  - /dev/dri/card0:/dev/dri/card0
+  - /dev/dri/render128:/dev/dri/render128
+cap_add:
+  - SYS_PTRACE
+group_add:
+  - video
+security_opt:
+  - seccomp:unconfined
+```

-To set up environment variables for deploying AudioQnA services, set up some parameters specific to the deployment environment and source the `set_env.sh` script in this directory:
+**How to Identify GPU Device IDs:**
+Use AMD GPU driver utilities to determine the correct `cardN` and `renderN` IDs for your GPU.

-For TGI inference usage:
+### Set deploy environment variables
+
+#### Setting variables in the operating system environment:
+
+##### Set variable HUGGINGFACEHUB_API_TOKEN:

 ```bash
-export host_ip="External_Public_IP"           # ip address of the node
-export HUGGINGFACEHUB_API_TOKEN="Your_HuggingFace_API_Token"
-export http_proxy="Your_HTTP_Proxy"           # http proxy if any
-export https_proxy="Your_HTTPs_Proxy"         # https proxy if any
-export no_proxy=localhost,127.0.0.1,$host_ip,whisper-service,speecht5-service,vllm-service,tgi-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server  # additional no proxies if needed
-export NGINX_PORT=${your_nginx_port}          # your usable port for nginx, 80 for example
-source ./set_env.sh
+### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
+export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
 ```

-For vLLM inference usage
+#### Set variables value in set_env\*\*\*\*.sh file:
+
+Go to Docker Compose directory:

 ```bash
-export host_ip="External_Public_IP"           # ip address of the node
-export HUGGINGFACEHUB_API_TOKEN="Your_HuggingFace_API_Token"
-export http_proxy="Your_HTTP_Proxy"           # http proxy if any
-export https_proxy="Your_HTTPs_Proxy"         # https proxy if any
-export no_proxy=localhost,127.0.0.1,$host_ip,whisper-service,speecht5-service,vllm-service,tgi-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server  # additional no proxies if needed
-export NGINX_PORT=${your_nginx_port}          # your usable port for nginx, 80 for example
-source ./set_env_vllm.sh
+cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
 ```

-### Deploy the Services Using Docker Compose
-
-To deploy the AudioQnA services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute the command below. It uses the 'compose.yaml' file.
-
-for TGI inference deployment
-
-```bash
-cd docker_compose/amd/gpu/rocm
-docker compose -f compose.yaml up -d
-```
-
-for vLLM inference deployment
-
-```bash
-cd docker_compose/amd/gpu/rocm
-docker compose -f compose_vllm.yaml up -d
-```
-
-> **Note**: developers should build docker image from source when:
->
-> - Developing off the git main branch (as the container's ports in the repo may be different > from the published docker image).
-> - Unable to download the docker image.
-> - Use a specific version of Docker image.
-
-Please refer to the table below to build different microservices from source:
-
-| Microservice | Deployment Guide                                                                                                                  |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------- |
-| vLLM         | [vLLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/vllm#build-docker)                    |
-| LLM          | [LLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/llms)                                                |
-| WHISPER      | [Whisper build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/asr/src#211-whisper-server-image)                |
-| SPEECHT5     | [SpeechT5 build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/tts/src#211-speecht5-server-image)              |
-| GPT-SOVITS   | [GPT-SOVITS build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/gpt-sovits/src#build-the-image) |
-| MegaService  | [MegaService build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image)                                     |
-| UI           | [Basic UI build guide](../../../../README_miscellaneous.md#build-ui-docker-image)                                                 |
-
-### Check the Deployment Status
-
-After running docker compose, check if all the containers launched via docker compose have started:
-
-#### For TGI inference deployment
-
-```bash
-docker ps -a
-```
-
-For the default deployment, the following 5 containers should have started:
-
-```
-CONTAINER ID   IMAGE                                                      COMMAND                  CREATED          STATUS          PORTS                                         NAMES
-d8007690868d   opea/audioqna:latest                                       "python audioqna.py"     21 seconds ago   Up 19 seconds   0.0.0.0:3008->8888/tcp, [::]:3008->8888/tcp   audioqna-rocm-backend-server
-87ba9a1d56ae   ghcr.io/huggingface/text-generation-inference:2.4.1-rocm   "/tgi-entrypoint.sh …"   21 seconds ago   Up 20 seconds   0.0.0.0:3006->80/tcp, [::]:3006->80/tcp       tgi-service
-59e869acd742   opea/speecht5:latest                                       "python speecht5_ser…"   21 seconds ago   Up 20 seconds   0.0.0.0:7055->7055/tcp, :::7055->7055/tcp     speecht5-service
-0143267a4327   opea/whisper:latest                                        "python whisper_serv…"   21 seconds ago   Up 20 seconds   0.0.0.0:7066->7066/tcp, :::7066->7066/tcp     whisper-service
-```
-
-### For vLLM inference deployment
-
-```bash
-docker ps -a
-```
-
-For the default deployment, the following 5 containers should have started:
-
-```
-CONTAINER ID   IMAGE                     COMMAND                  CREATED          STATUS          PORTS                                           NAMES
-f3e6893a69fa   opea/audioqna-ui:latest   "docker-entrypoint.s…"   37 seconds ago   Up 35 seconds   0.0.0.0:18039->5173/tcp, [::]:18039->5173/tcp   audioqna-ui-server
-f943e5cd21e9   opea/audioqna:latest      "python audioqna.py"     37 seconds ago   Up 35 seconds   0.0.0.0:18038->8888/tcp, [::]:18038->8888/tcp   audioqna-backend-server
-074e8c418f52   opea/speecht5:latest      "python speecht5_ser…"   37 seconds ago   Up 36 seconds   0.0.0.0:7055->7055/tcp, :::7055->7055/tcp       speecht5-service
-77abe498e427   opea/vllm-rocm:latest     "python3 /workspace/…"   37 seconds ago   Up 36 seconds   0.0.0.0:8081->8011/tcp, [::]:8081->8011/tcp     audioqna-vllm-service
-9074a95bb7a6   opea/whisper:latest       "python whisper_serv…"   37 seconds ago   Up 36 seconds   0.0.0.0:7066->7066/tcp, :::7066->7066/tcp       whisper-service
-```
-
-If any issues are encountered during deployment, refer to the [Troubleshooting](../../../../README_miscellaneous.md#troubleshooting) section.
-
-### Validate the Pipeline
-
-Once the AudioQnA services are running, test the pipeline using the following command:
-
-```bash
-# Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the base64 string to the megaservice endpoint.
-# The megaservice will return a spoken response as a base64 string. To listen to the response, decode the base64 string and save it as a .wav file.
-wget https://github.com/intel/intel-extension-for-transformers/raw/refs/heads/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav
-base64_audio=$(base64 -w 0 sample_2.wav)
-
-# if you are using speecht5 as the tts service, voice can be "default" or "male"
-# if you are using gpt-sovits for the tts service, you can set the reference audio following https://github.com/opea-project/GenAIComps/blob/main/comps/third_parties/gpt-sovits/src/README.md
-
-curl http://${host_ip}:3008/v1/audioqna \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -d "{\"audio\": \"${base64_audio}\", \"max_tokens\": 64, \"voice\": \"default\"}" \
-  | sed 's/^"//;s/"$//' | base64 -d > output.wav
-```
-
-**Note** : Access the AudioQnA UI by web browser through this URL: `http://${host_ip}:5173`. Please confirm the `5173` port is opened in the firewall. To validate each microservice used in the pipeline refer to the [Validate Microservices](#validate-microservices) section.
-
-### Cleanup the Deployment
-
-To stop the containers associated with the deployment, execute the following command:
+The example uses the Nano text editor. You can use any convenient text editor:

 #### If you use vLLM

 ```bash
-cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
-docker compose -f compose_vllm.yaml down
+nano set_env_vllm.sh
 ```

 #### If you use TGI

 ```bash
-cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
-docker compose -f compose.yaml down
+nano set_env.sh
 ```

-## AudioQnA Docker Compose Files
+If you are in a proxy environment, also set the proxy-related environment variables:

-In the context of deploying an AudioQnA pipeline on an Intel® Xeon® platform, we can pick and choose different large language model serving frameworks, or single English TTS/multi-language TTS component. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git).
+```bash
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+```

-| File                                     | Description                                                                               |
-| ---------------------------------------- | ----------------------------------------------------------------------------------------- |
-| [compose_vllm.yaml](./compose_vllm.yaml) | Default compose file using vllm as serving framework and redis as vector database         |
-| [compose.yaml](./compose.yaml)           | The LLM serving framework is TGI. All other configurations remain the same as the default |
+Set the values of the variables:

-### Validate the vLLM/TGI Service
+- **HOST_IP, HOST_IP_EXTERNAL** - These variables are used to configure the name/address of the service in the operating system environment for the application services to interact with each other and with the outside world.
+
+  If your server uses only an internal address and is not accessible from the Internet, then the values for these two variables will be the same and the value will be equal to the server's internal name/address.
+
+  If your server uses only an external, Internet-accessible address, then the values for these two variables will be the same and the value will be equal to the server's external name/address.
+
+  If your server is located on an internal network, has an internal address, but is accessible from the Internet via a proxy/firewall/load balancer, then the HOST_IP variable will have a value equal to the internal name/address of the server, and the EXTERNAL_HOST_IP variable will have a value equal to the external name/address of the proxy/firewall/load balancer behind which the server is located.
+
+  We set these values in the file set_env\*\*\*\*.sh
+
+- **Variables with names like "\*\*\*\*\*\*\_PORT"** - These variables set the IP port numbers for establishing network connections to the application services.
+  The values shown in the files set_env.sh and set_env_vllm.sh are the values used for the development and testing of the application, and are configured for the environment in which the development is performed. These values must be configured in accordance with the rules of network access to your environment's server, and must not overlap with the IP ports of other applications that are already in use.
+
+#### Set variables with script set_env\*\*\*\*.sh
+
+#### If you use vLLM
+
+```bash
+. set_env_vllm.sh
+```
+
+#### If you use TGI
+
+```bash
+. set_env.sh
+```
+
+### Start the services:
+
+#### If you use vLLM
+
+```bash
+docker compose -f compose_vllm.yaml up -d
+```
+
+#### If you use TGI
+
+```bash
+docker compose -f compose.yaml up -d
+```
+
+All containers should be running and should not restart:
+
+##### If you use vLLM:
+
+- audioqna-vllm-service
+- whisper-service
+- speecht5-service
+- audioqna-backend-server
+- audioqna-ui-server
+
+##### If you use TGI:
+
+- audioqna-tgi-service
+- whisper-service
+- speecht5-service
+- audioqna-backend-server
+- audioqna-ui-server
+
+---
+
+## Validate the Services
+
+### 1. Validate the vLLM/TGI Service

 #### If you use vLLM:

@@ -282,7 +313,7 @@ Checking the response from the service. The response should be similar to JSON:
 If the service response has a meaningful response in the value of the "generated_text" key,
 then we consider the TGI service to be successfully launched

-### Validate MegaServices
+### 2. Validate MegaServices

 Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the
 base64 string to the megaservice endpoint. The megaservice will return a spoken response as a base64 string. To listen
@@ -296,7 +327,7 @@ curl http://${host_ip}:3008/v1/audioqna \
  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
 ```

-### Validate MicroServices
+### 3. Validate MicroServices

 ```bash
 # whisper service
@@ -312,6 +343,18 @@ curl http://${host_ip}:7055/v1/tts \
  -H 'Content-Type: application/json'
 ```

-## Conclusion
+### 4. Stop application

-This guide should enable developers to deploy the default configuration or any of the other compose yaml files for different configurations. It also highlights the configurable parameters that can be set before deployment.
+#### If you use vLLM
+
+```bash
+cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
+docker compose -f compose_vllm.yaml down
+```
+
+#### If you use TGI
+
+```bash
+cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
+docker compose -f compose.yaml down
+```
--- a/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AudioQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -30,7 +30,7 @@ services:
    ports:
      - "3006:80"
    volumes:
-     - "${MODEL_CACHE:-./data}:/data"
+     - "./data:/data"
    shm_size: 1g
    devices:
      - /dev/kfd:/dev/kfd
--- a/AudioQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
@@ -1,146 +1,123 @@
-# Deploying AudioQnA on Intel® Xeon® Processors
+# Build Mega Service of AudioQnA on Xeon

-This document outlines the single node deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservices on Intel Xeon server. The steps include pulling Docker images, container deployment via Docker Compose, and service execution using microservices `llm`.
+This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server.
+
+The default pipeline deploys with vLLM as the LLM serving component. A TGI backend for the LLM microservice is also available; please refer to the [Start the MegaService](#-start-the-megaservice) section on this page.

 Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).

-## Table of Contents
+## 🚀 Build Docker images

-1. [AudioQnA Quick Start Deployment](#audioqna-quick-start-deployment)
-2. [AudioQnA Docker Compose Files](#audioqna-docker-compose-files)
-3. [Validate Microservices](#validate-microservices)
-4. [Conclusion](#conclusion)
+### 1. Source Code install GenAIComps

-## AudioQnA Quick Start Deployment
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+```

-This section describes how to quickly deploy and test the AudioQnA service manually on an Intel® Xeon® processor. The basic steps are:
+### 2. Build ASR Image

-1. [Access the Code](#access-the-code)
-2. [Configure the Deployment Environment](#configure-the-deployment-environment)
-3. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose)
-4. [Check the Deployment Status](#check-the-deployment-status)
-5. [Validate the Pipeline](#validate-the-pipeline)
-6. [Cleanup the Deployment](#cleanup-the-deployment)
+```bash
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
+```

-### Access the Code
+### 3. Build vLLM Image

-Clone the GenAIExample repository and access the AudioQnA Intel® Xeon® platform Docker Compose files and supporting scripts:
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd ./vllm/
+VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
+```
+
+### 4. Build TTS Image
+
+```bash
+docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
+
+# multilang tts (optional)
+docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile .
+```
+
+### 5. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:

 ```bash
 git clone https://github.com/opea-project/GenAIExamples.git
-cd GenAIExamples/AudioQnA
+cd GenAIExamples/AudioQnA/
+docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
 ```

-Then checkout a released version, such as v1.2:
+Then run the command `docker images`; you should have the following images ready:
+
+1. `opea/whisper:latest`
+2. `opea/vllm:latest`
+3. `opea/speecht5:latest`
+4. `opea/audioqna:latest`
+5. `opea/gpt-sovits:latest` (optional)
+
+## 🚀 Set the environment variables
+
+Before starting the services with `docker compose`, you have to recheck the following environment variables.

 ```bash
-git checkout v1.2
+export host_ip=<your External Public IP>    # export host_ip=$(hostname -I | awk '{print $1}')
+export HUGGINGFACEHUB_API_TOKEN=<your HF token>
+
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}
+export GPT_SOVITS_SERVER_HOST_IP=${host_ip}
+
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export GPT_SOVITS_SERVER_PORT=9880
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 ```

-### Configure the Deployment Environment
+Alternatively, use the `set_env.sh` file to set up the environment variables.

-To set up environment variables for deploying AudioQnA services, set up some parameters specific to the deployment environment and source the `set_env.sh` script in this directory:
+Note:
+
+- Please replace host_ip with your external IP address; do not use localhost.
+- If you are in a proxy environment, also set the proxy-related environment variables:
+
+```
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,gpt-sovits-service,tgi-service,vllm-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server
+```
+
+## 🚀 Start the MegaService

 ```bash
-export host_ip="External_Public_IP"           # ip address of the node
-export HUGGINGFACEHUB_API_TOKEN="Your_HuggingFace_API_Token"
-export http_proxy="Your_HTTP_Proxy"           # http proxy if any
-export https_proxy="Your_HTTPs_Proxy"         # https proxy if any
-export no_proxy=localhost,127.0.0.1,$host_ip,whisper-service,speecht5-service,vllm-service,tgi-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server  # additional no proxies if needed
-export NGINX_PORT=${your_nginx_port}          # your usable port for nginx, 80 for example
-source ./set_env.sh
+cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
 ```

-Consult the section on [AudioQnA Service configuration](#audioqna-configuration) for information on how service specific configuration parameters affect deployments.
-
-### Deploy the Services Using Docker Compose
-
-To deploy the AudioQnA services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute the command below. It uses the 'compose.yaml' file.
-
-```bash
-cd docker_compose/intel/cpu/xeon
-docker compose -f compose.yaml up -d
-```
-
-> **Note**: developers should build docker image from source when:
->
-> - Developing off the git main branch (as the container's ports in the repo may be different > from the published docker image).
-> - Unable to download the docker image.
-> - Use a specific version of Docker image.
-
-Please refer to the table below to build different microservices from source:
-
-| Microservice | Deployment Guide                                                                                                                  |
-| ------------ | --------------------------------------------------------------------------------------------------------------------------------- |
-| vLLM         | [vLLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/vllm#build-docker)                    |
-| LLM          | [LLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/llms)                                                |
-| WHISPER      | [Whisper build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/asr/src#211-whisper-server-image)                |
-| SPEECHT5     | [SpeechT5 build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/tts/src#211-speecht5-server-image)              |
-| GPT-SOVITS   | [GPT-SOVITS build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/gpt-sovits/src#build-the-image) |
-| MegaService  | [MegaService build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image)                                     |
-| UI           | [Basic UI build guide](../../../../README_miscellaneous.md#build-ui-docker-image)                                                 |
-
-### Check the Deployment Status
-
-After running docker compose, check if all the containers launched via docker compose have started:
-
-```bash
-docker ps -a
-```
-
-For the default deployment, the following 5 containers should have started:
+If you use vLLM as the LLM serving backend:

 ```
-1c67e44c39d2   opea/audioqna-ui:latest   "docker-entrypoint.s…"   About a minute ago   Up About a minute             0.0.0.0:5173->5173/tcp, :::5173->5173/tcp   audioqna-xeon-ui-server
-833a42677247   opea/audioqna:latest      "python audioqna.py"     About a minute ago   Up About a minute             0.0.0.0:3008->8888/tcp, :::3008->8888/tcp   audioqna-xeon-backend-server
-5dc4eb9bf499   opea/speecht5:latest      "python speecht5_ser…"   About a minute ago   Up About a minute             0.0.0.0:7055->7055/tcp, :::7055->7055/tcp   speecht5-service
-814e6efb1166   opea/vllm:latest          "python3 -m vllm.ent…"   About a minute ago   Up About a minute (healthy)   0.0.0.0:3006->80/tcp, :::3006->80/tcp       vllm-service
-46f7a00f4612   opea/whisper:latest       "python whisper_serv…"   About a minute ago   Up About a minute             0.0.0.0:7066->7066/tcp, :::7066->7066/tcp   whisper-service
+docker compose up -d
+
+# multilang tts (optional)
+docker compose -f compose_multilang.yaml up -d
 ```

-If any issues are encountered during deployment, refer to the [Troubleshooting](../../../../README_miscellaneous.md#troubleshooting) section.
+If you use TGI as the LLM serving backend:

-### Validate the Pipeline
-
-Once the AudioQnA services are running, test the pipeline using the following command:
-
-```bash
-# Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the base64 string to the megaservice endpoint.
-# The megaservice will return a spoken response as a base64 string. To listen to the response, decode the base64 string and save it as a .wav file.
-wget https://github.com/intel/intel-extension-for-transformers/raw/refs/heads/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav
-base64_audio=$(base64 -w 0 sample_2.wav)
-
-# if you are using speecht5 as the tts service, voice can be "default" or "male"
-# if you are using gpt-sovits for the tts service, you can set the reference audio following https://github.com/opea-project/GenAIComps/blob/main/comps/third_parties/gpt-sovits/src/README.md
-
-curl http://${host_ip}:3008/v1/audioqna \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -d "{\"audio\": \"${base64_audio}\", \"max_tokens\": 64, \"voice\": \"default\"}" \
-  | sed 's/^"//;s/"$//' | base64 -d > output.wav
+```
+docker compose -f compose_tgi.yaml up -d
 ```

-**Note** : Access the AudioQnA UI by web browser through this URL: `http://${host_ip}:5173`. Please confirm the `5173` port is opened in the firewall. To validate each microservice used in the pipeline refer to the [Validate Microservices](#validate-microservices) section.
-
-### Cleanup the Deployment
-
-To stop the containers associated with the deployment, execute the following command:
-
-```bash
-docker compose -f compose.yaml down
-```
-
-## AudioQnA Docker Compose Files
-
-In the context of deploying an AudioQnA pipeline on an Intel® Xeon® platform, we can pick and choose different large language model serving frameworks, or single English TTS/multi-language TTS component. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git).
-
-| File                                               | Description                                                                               |
-| -------------------------------------------------- | ----------------------------------------------------------------------------------------- |
-| [compose.yaml](./compose.yaml)                     | Default compose file using vllm as serving framework and redis as vector database         |
-| [compose_tgi.yaml](./compose_tgi.yaml)             | The LLM serving framework is TGI. All other configurations remain the same as the default |
-| [compose_multilang.yaml](./compose_multilang.yaml) | The TTS component is GPT-SoVITS. All other configurations remain the same as the default  |
-
-## Validate MicroServices
+## 🚀 Test MicroServices

 1. Whisper Service

@@ -184,7 +161,7 @@ In the context of deploying an AudioQnA pipeline on an Intel® Xeon® platform,

 3. TTS Service

-   ```bash
+   ```
   # speecht5 service
   curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3

@@ -192,6 +169,17 @@ In the context of deploying an AudioQnA pipeline on an Intel® Xeon® platform,
   curl http://${host_ip}:${GPT_SOVITS_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
   ```

-## Conclusion
+## 🚀 Test MegaService

-This guide should enable developers to deploy the default configuration or any of the other compose yaml files for different configurations. It also highlights the configurable parameters that can be set before deployment.
+Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the
+base64 string to the megaservice endpoint. The megaservice will return a spoken response as a base64 string. To listen
+to the response, decode the base64 string and save it as a .wav file.
+
+```bash
+# if you are using speecht5 as the tts service, voice can be "default" or "male"
+# if you are using gpt-sovits for the tts service, you can set the reference audio following https://github.com/opea-project/GenAIComps/blob/main/comps/tts/src/integrations/dependency/gpt-sovits/README.md
+curl http://${host_ip}:3008/v1/audioqna \
+  -X POST \
+  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
+  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
+```
--- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml
@@ -24,9 +24,6 @@ services:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
-      llm_download: ${llm_download:-True}
-    # volumes:
-    #  - ./pretrained_models/:/home/user/GPT-SoVITS/GPT_SoVITS/pretrained_models/
    restart: unless-stopped
  vllm-service:
    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -1,170 +1,145 @@
-# Deploying AudioQnA on Intel® Gaudi® Processors
+# Build Mega Service of AudioQnA on Gaudi

-This document outlines the single node deployment process for a AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservices on Intel Gaudi server. The steps include pulling Docker images, container deployment via Docker Compose, and service execution using microservices `llm`.
+This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server.
+
+The default pipeline deploys with vLLM as the LLM serving component. A TGI backend for the LLM microservice is also available; please refer to the [Start the MegaService](#-start-the-megaservice) section on this page.

 Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).

-## Table of Contents
+## 🚀 Build Docker images

-1. [AudioQnA Quick Start Deployment](#audioqna-quick-start-deployment)
-2. [AudioQnA Docker Compose Files](#audioqna-docker-compose-files)
-3. [Validate Microservices](#validate-microservices)
-4. [Conclusion](#conclusion)
+### 1. Source Code install GenAIComps

-## AudioQnA Quick Start Deployment
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+```

-This section describes how to quickly deploy and test the AudioQnA service manually on an Intel® Gaudi® processor. The basic steps are:
+### 2. Build ASR Image

-1. [Access the Code](#access-the-code)
-2. [Configure the Deployment Environment](#configure-the-deployment-environment)
-3. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose)
-4. [Check the Deployment Status](#check-the-deployment-status)
-5. [Validate the Pipeline](#validate-the-pipeline)
-6. [Cleanup the Deployment](#cleanup-the-deployment)
+```bash
+docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
+```

-### Access the Code
+### 3. Build vLLM Image

-Clone the GenAIExample repository and access the AudioQnA Intel® Gaudi® platform Docker Compose files and supporting scripts:
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork/
+VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+git checkout ${VLLM_VER}
+docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .
+
+### 4. Build TTS Image
+
+```bash
+docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile.intel_hpu .
+```
+
+### 5. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:

 ```bash
 git clone https://github.com/opea-project/GenAIExamples.git
-cd GenAIExamples/AudioQnA
+cd GenAIExamples/AudioQnA/
+docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
 ```

-Then checkout a released version, such as v1.2:
+Then run the command `docker images`; you should have the following images ready:
+
+1. `opea/whisper-gaudi:latest`
+2. `opea/vllm-gaudi:latest`
+3. `opea/speecht5-gaudi:latest`
+4. `opea/audioqna:latest`
+
+## 🚀 Set the environment variables
+
+Before starting the services with `docker compose`, you have to recheck the following environment variables.

 ```bash
-git checkout v1.2
+export host_ip=<your External Public IP>    # export host_ip=$(hostname -I | awk '{print $1}')
+export HUGGINGFACEHUB_API_TOKEN=<your HF token>
+
+export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+# set vLLM parameters
+export NUM_CARDS=1
+export BLOCK_SIZE=128
+export MAX_NUM_SEQS=256
+export MAX_SEQ_LEN_TO_CAPTURE=2048
+
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export WHISPER_SERVER_HOST_IP=${host_ip}
+export SPEECHT5_SERVER_HOST_IP=${host_ip}
+export LLM_SERVER_HOST_IP=${host_ip}
+
+export WHISPER_SERVER_PORT=7066
+export SPEECHT5_SERVER_PORT=7055
+export LLM_SERVER_PORT=3006
+
+export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
 ```

-### Configure the Deployment Environment
+Alternatively, use the `set_env.sh` file to set up the environment variables.

-To set up environment variables for deploying AudioQnA services, set up some parameters specific to the deployment environment and source the `set_env.sh` script in this directory:
+Note:
+
+- Please replace host_ip with your external IP address; do not use localhost.
+- If you are in a proxy environment, also set the proxy-related environment variables:
+
+```
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,tgi-service,vllm-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server
+```
+
+## 🚀 Start the MegaService
+
+> **_NOTE:_** Users will need at least three Gaudi cards for AudioQnA.

 ```bash
-export host_ip="External_Public_IP"           # ip address of the node
-export HUGGINGFACEHUB_API_TOKEN="Your_HuggingFace_API_Token"
-export http_proxy="Your_HTTP_Proxy"           # http proxy if any
-export https_proxy="Your_HTTPs_Proxy"         # https proxy if any
-export no_proxy=localhost,127.0.0.1,$host_ip,whisper-service,speecht5-service,vllm-service,tgi-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server  # additional no proxies if needed
-export NGINX_PORT=${your_nginx_port}          # your usable port for nginx, 80 for example
-source ./set_env.sh
+cd GenAIExamples/AudioQnA/docker_compose/intel/hpu/gaudi/
 ```

-Consult the section on [AudioQnA Service configuration](#audioqna-configuration) for information on how service specific configuration parameters affect deployments.
-
-### Deploy the Services Using Docker Compose
-
-To deploy the AudioQnA services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute the command below. It uses the 'compose.yaml' file.
-
-```bash
-cd docker_compose/intel/hpu/gaudi
-docker compose -f compose.yaml up -d
-```
-
-> **Note**: developers should build docker image from source when:
->
-> - Developing off the git main branch (as the container's ports in the repo may be different > from the published docker image).
-> - Unable to download the docker image.
-> - Use a specific version of Docker image.
-
-Please refer to the table below to build different microservices from source:
-
-| Microservice | Deployment Guide                                                                                                     |
-| ------------ | -------------------------------------------------------------------------------------------------------------------- |
-| vLLM-gaudi   | [vLLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/third_parties/vllm#build-docker-1)     |
-| LLM          | [LLM build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/llms)                                   |
-| WHISPER      | [Whisper build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/asr/src#211-whisper-server-image)   |
-| SPEECHT5     | [SpeechT5 build guide](https://github.com/opea-project/GenAIComps/tree/main/comps/tts/src#211-speecht5-server-image) |
-| MegaService  | [MegaService build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image)                        |
-| UI           | [Basic UI build guide](../../../../README_miscellaneous.md#build-ui-docker-image)                                    |
-
-### Check the Deployment Status
-
-After running docker compose, check if all the containers launched via docker compose have started:
-
-```bash
-docker ps -a
-```
-
-For the default deployment, the following 5 containers should have started:
+If you use vLLM as the LLM serving backend:

 ```
-23f27dab14a5   opea/whisper-gaudi:latest                                                                   "python whisper_serv…"   18 minutes ago   Up 18 minutes             0.0.0.0:7066->7066/tcp, :::7066->7066/tcp                                              whisper-service
-629da06b7fb2   opea/audioqna-ui:latest                                                                     "docker-entrypoint.s…"   19 minutes ago   Up 18 minutes             0.0.0.0:5173->5173/tcp, :::5173->5173/tcp                                              audioqna-gaudi-ui-server
-8a74d9806b87   opea/audioqna:latest                                                                        "python audioqna.py"     19 minutes ago   Up 18 minutes             0.0.0.0:3008->8888/tcp, [::]:3008->8888/tcp                                            audioqna-gaudi-backend-server
-29324430f42e   opea/vllm-gaudi:latest                                                                      "python3 -m vllm.ent…"   19 minutes ago   Up 19 minutes (healthy)   0.0.0.0:3006->80/tcp, [::]:3006->80/tcp                                                vllm-gaudi-service
-dbd585f0a95a   opea/speecht5-gaudi:latest                                                                  "python speecht5_ser…"   19 minutes ago   Up 19 minutes             0.0.0.0:7055->7055/tcp, :::7055->7055/tcp                                              speecht5-service
+docker compose up -d
 ```

-If any issues are encountered during deployment, refer to the [Troubleshooting](../../../../README_miscellaneous.md#troubleshooting) section.
+If you use TGI as the LLM serving backend:

-### Validate the Pipeline
-
-Once the AudioQnA services are running, test the pipeline using the following command:
-
-```bash
-# Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the base64 string to the megaservice endpoint.
-# The megaservice will return a spoken response as a base64 string. To listen to the response, decode the base64 string and save it as a .wav file.
-wget https://github.com/intel/intel-extension-for-transformers/raw/refs/heads/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav
-base64_audio=$(base64 -w 0 sample_2.wav)
-
-# if you are using speecht5 as the tts service, voice can be "default" or "male"
-
-curl http://${host_ip}:3008/v1/audioqna \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -d "{\"audio\": \"${base64_audio}\", \"max_tokens\": 64, \"voice\": \"default\"}" \
-  | sed 's/^"//;s/"$//' | base64 -d > output.wav
+```
+docker compose -f compose_tgi.yaml up -d
 ```

-**Note** : Access the AudioQnA UI by web browser through this URL: `http://${host_ip}:5173`. Please confirm the `5173` port is opened in the firewall. To validate each microservice used in the pipeline refer to the [Validate Microservices](#validate-microservices) section.
-
-### Cleanup the Deployment
-
-To stop the containers associated with the deployment, execute the following command:
-
-```bash
-docker compose -f compose.yaml down
-```
-
-## AudioQnA Docker Compose Files
-
-In the context of deploying an AudioQnA pipeline on an Intel® Gaudi® platform, we can pick and choose different large language model serving frameworks. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git).
-
-| File                                   | Description                                                                               |
-| -------------------------------------- | ----------------------------------------------------------------------------------------- |
-| [compose.yaml](./compose.yaml)         | Default compose file using vllm as serving framework and redis as vector database         |
-| [compose_tgi.yaml](./compose_tgi.yaml) | The LLM serving framework is TGI. All other configurations remain the same as the default |
-
-## Validate MicroServices
+## 🚀 Test MicroServices

 1. Whisper Service

   ```bash
-   wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
-   curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
-     -H "Content-Type: multipart/form-data" \
-     -F file="@./sample.wav" \
-     -F model="openai/whisper-small"
+   curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr \
+     -X POST \
+     -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+     -H 'Content-Type: application/json'
   ```

 2. LLM backend Service

-   In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready and the container (`vllm-service` or `tgi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
+   In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready and the container (`vllm-gaudi-service` or `tgi-gaudi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.

   Or try the command below to check whether the LLM serving is ready.

   ```bash
   # vLLM service
-   docker logs vllm-service 2>&1 | grep complete
+   docker logs vllm-gaudi-service 2>&1 | grep complete
   # If the service is ready, you will get the response like below.
   INFO:     Application startup complete.
   ```

   ```bash
   # TGI service
-   docker logs tgi-service | grep Connected
+   docker logs tgi-gaudi-service | grep Connected
   # If the service is ready, you will get the response like below.
   2024-09-03T02:47:53.402023Z  INFO text_generation_router::server: router/src/server.rs:2311: Connected
   ```
@@ -181,11 +156,24 @@ In the context of deploying an AudioQnA pipeline on an Intel® Gaudi® platform,

 3. TTS Service

-   ```bash
+   ```bash
   # speecht5 service
-   curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
+   curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/tts \
+     -X POST \
+     -d '{"text": "Who are you?"}' \
+     -H 'Content-Type: application/json'
   ```

-## Conclusion
+## 🚀 Test MegaService

-This guide should enable developers to deploy the default configuration or any of the other compose yaml files for different configurations. It also highlights the configurable parameters that can be set before deployment.
+Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the
+base64 string to the megaservice endpoint. The megaservice will return a spoken response as a base64 string. To listen
+to the response, decode the base64 string and save it as a .wav file.
+
+```bash
+# voice can be "default" or "male"
+curl http://${host_ip}:3008/v1/audioqna \
+  -X POST \
+  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
+  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
+```
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -62,7 +62,7 @@ services:
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
  audioqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-gaudi-backend-server
--- a/AudioQnA/docker_image_build/build.yaml
+++ b/AudioQnA/docker_image_build/build.yaml
@@ -5,8 +5,6 @@ services:
  audioqna:
    build:
      args:
-        IMAGE_REPO: ${REGISTRY}
-        BASE_TAG: ${TAG}
        http_proxy: ${http_proxy}
        https_proxy: ${https_proxy}
        no_proxy: ${no_proxy}
@@ -28,13 +26,13 @@ services:
  whisper-gaudi:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/whisper/src/Dockerfile.intel_hpu
+      dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu
    extends: audioqna
    image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
  whisper:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/whisper/src/Dockerfile
+      dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile
    extends: audioqna
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
  asr:
@@ -52,13 +50,13 @@ services:
  speecht5-gaudi:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/speecht5/src/Dockerfile.intel_hpu
+      dockerfile: comps/tts/src/integrations/dependency/speecht5/Dockerfile.intel_hpu
    extends: audioqna
    image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
  speecht5:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/speecht5/src/Dockerfile
+      dockerfile: comps/tts/src/integrations/dependency/speecht5/Dockerfile
    extends: audioqna
    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
  tts:
@@ -70,13 +68,13 @@ services:
  gpt-sovits:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/gpt-sovits/src/Dockerfile
+      dockerfile: comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile
    extends: audioqna
    image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
  vllm:
    build:
      context: vllm
-      dockerfile: docker/Dockerfile.cpu
+      dockerfile: Dockerfile.cpu
    extends: audioqna
    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
  vllm-gaudi:
@@ -87,7 +85,10 @@ services:
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
  vllm-rocm:
    build:
+      args:
+        http_proxy: ${http_proxy}
+        https_proxy: ${https_proxy}
+        no_proxy: ${no_proxy}
      context: GenAIComps
      dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
-    extends: audioqna
    image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
--- a/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
@@ -1,15 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-tgi:
-  enabled: false
-vllm:
-  enabled: true
-
-speecht5:
-  enabled: false
-gpt-sovits:
-  enabled: true
-
-image:
-  repository: opea/audioqna-multilang
--- a/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -1,12 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-tgi:
-  enabled: true
-vllm:
-  enabled: false
-
-speecht5:
-  enabled: true
-gpt-sovits:
-  enabled: false
--- a/AudioQnA/kubernetes/helm/cpu-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-values.yaml
@@ -2,11 +2,4 @@
 # SPDX-License-Identifier: Apache-2.0

 tgi:
-  enabled: false
-vllm:
-  enabled: true
-
-speecht5:
-  enabled: true
-gpt-sovits:
-  enabled: false
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
--- a/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -1,49 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-tgi:
-  enabled: true
-  accelDevice: "gaudi"
-  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
-  resources:
-    limits:
-      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  HF_HUB_DISABLE_PROGRESS_BARS: 1
-  HF_HUB_ENABLE_HF_TRANSFER: 0
-  ENABLE_HPU_GRAPH: true
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
-vllm:
-  enabled: false
-
-whisper:
-  image:
-    repository: opea/whisper-gaudi
-  resources:
-    limits:
-      habana.ai/gaudi: 1
-
-speecht5:
-  enabled: true
-  image:
-    repository: opea/speecht5-gaudi
-  resources:
-    limits:
-      habana.ai/gaudi: 1
-gpt-sovits:
-  enabled: false
--- a/AudioQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -2,27 +2,35 @@
 # SPDX-License-Identifier: Apache-2.0

 tgi:
-  enabled: false
-vllm:
-  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: opea/vllm-gaudi
-  startupProbe:
-    failureThreshold: 360
-
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
-  extraCmdArgs: [
-    "--tensor-parallel-size", "1",
-    "--block-size", "128",
-    "--max-num-seqs", "256",
-    "--max-seq-len-to-capture", "2048"
-  ]
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  CUDA_GRAPHS: ""
+  HF_HUB_DISABLE_PROGRESS_BARS: 1
+  HF_HUB_ENABLE_HF_TRANSFER: 0
+  ENABLE_HPU_GRAPH: true
+  LIMIT_HPU_GRAPH: true
+  USE_FLASH_ATTENTION: true
+  FLASH_ATTENTION_RECOMPUTE: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120

 whisper:
  image:
@@ -32,11 +40,8 @@ whisper:
      habana.ai/gaudi: 1

 speecht5:
-  enabled: true
  image:
    repository: opea/speecht5-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
-gpt-sovits:
-  enabled: false
--- a/AudioQnA/tests/test_compose_multilang_on_xeon.sh
+++ b/AudioQnA/tests/test_compose_multilang_on_xeon.sh
@@ -17,17 +17,23 @@ ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    git clone https://github.com/vllm-project/vllm.git
    cd ./vllm/
-    VLLM_VER="v0.8.3"
+    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

@@ -97,26 +103,14 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_compose_on_gaudi.sh
+++ b/AudioQnA/tests/test_compose_on_gaudi.sh
@@ -17,17 +17,23 @@ ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    git clone https://github.com/HabanaAI/vllm-fork.git
    cd vllm-fork/
-    VLLM_VER=v0.6.6.post1+Gaudi-1.20.0
+    VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

@@ -99,8 +105,34 @@ function validate_megaservice() {
        echo "Result wrong."
        exit 1
    fi
+
 }

+#function validate_frontend() {
+#    cd $WORKPATH/ui/svelte
+#    local conda_env_name="OPEA_e2e"
+#    export PATH=${HOME}/miniforge3/bin/:$PATH
+##    conda remove -n ${conda_env_name} --all -y
+##    conda create -n ${conda_env_name} python=3.12 -y
+#    source activate ${conda_env_name}
+#
+#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+##    conda install -c conda-forge nodejs=22.6.0 -y
+#    npm install && npm ci && npx playwright install --with-deps
+#    node -v && npm -v && pip list
+#
+#    exit_status=0
+#    npx playwright test || exit_status=$?
+#
+#    if [ $exit_status -ne 0 ]; then
+#        echo "[TEST INFO]: ---------frontend test failed---------"
+#        exit $exit_status
+#    else
+#        echo "[TEST INFO]: ---------frontend test passed---------"
+#    fi
+#}
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    docker compose -f compose.yaml stop && docker compose rm -f
@@ -108,26 +140,15 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"
+    # validate_frontend

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_compose_on_rocm.sh
+++ b/AudioQnA/tests/test_compose_on_rocm.sh
@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
 echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
 export REGISTRY=${IMAGE_REPO}
 export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"/var/lib/GenAI/data"}

 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
@@ -18,18 +17,25 @@ export PATH="~/miniconda3/bin:$PATH"

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper speecht5"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-
+    echo "docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm"
+    docker pull ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
    docker images && sleep 1s
 }

@@ -49,6 +55,8 @@ function start_services() {

    export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna

+    # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
    # Start Docker Containers
    docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
@@ -79,6 +87,32 @@ function validate_megaservice() {

 }

+#function validate_frontend() {
+# Frontend tests are currently disabled
+#    cd $WORKPATH/ui/svelte
+#    local conda_env_name="OPEA_e2e"
+#    export PATH=${HOME}/miniforge3/bin/:$PATH
+##    conda remove -n ${conda_env_name} --all -y
+##    conda create -n ${conda_env_name} python=3.12 -y
+#    source activate ${conda_env_name}
+#
+#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+##    conda install -c conda-forge nodejs -y
+#    npm install && npm ci && npx playwright install --with-deps
+#    node -v && npm -v && pip list
+#
+#    exit_status=0
+#    npx playwright test || exit_status=$?
+#
+#    if [ $exit_status -ne 0 ]; then
+#        echo "[TEST INFO]: ---------frontend test failed---------"
+#        exit $exit_status
+#    else
+#        echo "[TEST INFO]: ---------frontend test passed---------"
+#    fi
+#}
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/amd/gpu/rocm/
    docker compose stop && docker compose rm -f
@@ -86,26 +120,16 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"
+    # Frontend tests are currently disabled
+    # validate_frontend

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_compose_on_xeon.sh
+++ b/AudioQnA/tests/test_compose_on_xeon.sh
@@ -17,17 +17,23 @@ ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    git clone https://github.com/vllm-project/vllm.git
    cd ./vllm/
-    VLLM_VER="v0.8.3"
+    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

@@ -89,6 +95,31 @@ function validate_megaservice() {

 }

+#function validate_frontend() {
+#    cd $WORKPATH/ui/svelte
+#    local conda_env_name="OPEA_e2e"
+#    export PATH=${HOME}/miniforge3/bin/:$PATH
+##    conda remove -n ${conda_env_name} --all -y
+##    conda create -n ${conda_env_name} python=3.12 -y
+#    source activate ${conda_env_name}
+#
+#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+##    conda install -c conda-forge nodejs=22.6.0 -y
+#    npm install && npm ci && npx playwright install --with-deps
+#    node -v && npm -v && pip list
+#
+#    exit_status=0
+#    npx playwright test || exit_status=$?
+#
+#    if [ $exit_status -ne 0 ]; then
+#        echo "[TEST INFO]: ---------frontend test failed---------"
+#        exit $exit_status
+#    else
+#        echo "[TEST INFO]: ---------frontend test passed---------"
+#    fi
+#}
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    docker compose -f compose.yaml stop && docker compose rm -f
@@ -96,26 +127,15 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"
+    # validate_frontend

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_compose_tgi_on_gaudi.sh
+++ b/AudioQnA/tests/test_compose_tgi_on_gaudi.sh
@@ -17,18 +17,25 @@ ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

+    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker images && sleep 1s
 }

@@ -48,6 +55,7 @@ function start_services() {

    export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
    export host_ip=${ip_address}
+    # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

    # Start Docker Containers
    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
@@ -91,6 +99,31 @@ function validate_megaservice() {

 }

+#function validate_frontend() {
+#    cd $WORKPATH/ui/svelte
+#    local conda_env_name="OPEA_e2e"
+#    export PATH=${HOME}/miniforge3/bin/:$PATH
+##    conda remove -n ${conda_env_name} --all -y
+##    conda create -n ${conda_env_name} python=3.12 -y
+#    source activate ${conda_env_name}
+#
+#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+##    conda install -c conda-forge nodejs=22.6.0 -y
+#    npm install && npm ci && npx playwright install --with-deps
+#    node -v && npm -v && pip list
+#
+#    exit_status=0
+#    npx playwright test || exit_status=$?
+#
+#    if [ $exit_status -ne 0 ]; then
+#        echo "[TEST INFO]: ---------frontend test failed---------"
+#        exit $exit_status
+#    else
+#        echo "[TEST INFO]: ---------frontend test passed---------"
+#    fi
+#}
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    docker compose -f compose_tgi.yaml stop && docker compose rm -f
@@ -98,26 +131,15 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"
+    # validate_frontend

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_compose_tgi_on_xeon.sh
+++ b/AudioQnA/tests/test_compose_tgi_on_xeon.sh
@@ -17,18 +17,25 @@ ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper speecht5"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

+    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    docker images && sleep 1s
 }

@@ -49,6 +56,8 @@ function start_services() {
    export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
    export host_ip=${ip_address}

+    # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
    # Start Docker Containers
    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
@@ -81,6 +90,31 @@ function validate_megaservice() {

 }

+#function validate_frontend() {
+#    cd $WORKPATH/ui/svelte
+#    local conda_env_name="OPEA_e2e"
+#    export PATH=${HOME}/miniforge3/bin/:$PATH
+##    conda remove -n ${conda_env_name} --all -y
+##    conda create -n ${conda_env_name} python=3.12 -y
+#    source activate ${conda_env_name}
+#
+#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+##    conda install -c conda-forge nodejs=22.6.0 -y
+#    npm install && npm ci && npx playwright install --with-deps
+#    node -v && npm -v && pip list
+#
+#    exit_status=0
+#    npx playwright test || exit_status=$?
+#
+#    if [ $exit_status -ne 0 ]; then
+#        echo "[TEST INFO]: ---------frontend test failed---------"
+#        exit $exit_status
+#    else
+#        echo "[TEST INFO]: ---------frontend test passed---------"
+#    fi
+#}
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    docker compose -f compose_tgi.yaml stop && docker compose rm -f
@@ -88,26 +122,15 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"
+    # validate_frontend

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_compose_vllm_on_rocm.sh
+++ b/AudioQnA/tests/test_compose_vllm_on_rocm.sh
@@ -17,13 +17,19 @@ export PATH="~/miniconda3/bin:$PATH"

 function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
-    pushd GenAIComps
-    echo "GenAIComps test commit is $(git rev-parse HEAD)"
-    docker build --no-cache -t ${REGISTRY}/comps-base:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-    popd && sleep 1s

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper speecht5 vllm-rocm"
@@ -86,6 +92,32 @@ function validate_megaservice() {

 }

+#function validate_frontend() {
+## Frontend tests are currently disabled
+#    cd $WORKPATH/ui/svelte
+#    local conda_env_name="OPEA_e2e"
+#    export PATH=${HOME}/miniforge3/bin/:$PATH
+##    conda remove -n ${conda_env_name} --all -y
+##    conda create -n ${conda_env_name} python=3.12 -y
+#    source activate ${conda_env_name}
+#
+#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+#
+##    conda install -c conda-forge nodejs -y
+#    npm install && npm ci && npx playwright install --with-deps
+#    node -v && npm -v && pip list
+#
+#    exit_status=0
+#    npx playwright test || exit_status=$?
+#
+#    if [ $exit_status -ne 0 ]; then
+#        echo "[TEST INFO]: ---------frontend test failed---------"
+#        exit $exit_status
+#    else
+#        echo "[TEST INFO]: ---------frontend test passed---------"
+#    fi
+#}
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/amd/gpu/rocm/
    docker compose -f compose_vllm.yaml stop && docker compose -f compose_vllm.yaml rm -f
@@ -93,26 +125,16 @@ function stop_docker() {

 function main() {

-    echo "::group::stop_docker"
    stop_docker
-    echo "::endgroup::"
-
-    echo "::group::build_docker_images"
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
-    echo "::endgroup::"
-
-    echo "::group::start_services"
    start_services
-    echo "::endgroup::"

-    echo "::group::validate_megaservice"
    validate_megaservice
-    echo "::endgroup::"
+    # Frontend tests are currently disabled
+    # validate_frontend

-    echo "::group::stop_docker"
    stop_docker
-    docker system prune -f
-    echo "::endgroup::"
+    echo y | docker system prune

 }

--- a/AudioQnA/tests/test_gmc_on_gaudi.sh
+++ b/AudioQnA/tests/test_gmc_on_gaudi.sh
@@ -2,8 +2,6 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-# ===== Deprecated =====
-
 set -xe
 USER_ID=$(whoami)
 LOG_PATH=/home/$(whoami)/logs
--- a/AudioQnA/tests/test_gmc_on_xeon.sh
+++ b/AudioQnA/tests/test_gmc_on_xeon.sh
@@ -2,8 +2,6 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-# ===== Deprecated =====
-
 set -xe
 USER_ID=$(whoami)
 LOG_PATH=/home/$(whoami)/logs
--- a/AvatarChatbot/Dockerfile
+++ b/AvatarChatbot/Dockerfile
@@ -1,8 +1,48 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-ARG BASE_TAG=latest
-FROM opea/comps-base:$BASE_TAG
+# Stage 1: base setup used by other stages
+FROM python:3.11-slim AS base
+
+# get security updates
+RUN apt-get update && apt-get upgrade -y && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ENV HOME=/home/user
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p $HOME && \
+    chown -R user $HOME
+
+WORKDIR $HOME
+
+
+# Stage 2: latest GenAIComps sources
+FROM base AS git
+
+RUN apt-get update && apt-get install -y --no-install-recommends git
+RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
+
+
+# Stage 3: common layer shared by services using GenAIComps
+FROM base AS comps-base
+
+# copy just relevant parts
+COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
+COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
+
+WORKDIR $HOME/GenAIComps
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
+WORKDIR $HOME
+
+ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
+
+USER user
+
+
+# Stage 4: unique part
+FROM comps-base

 COPY ./avatarchatbot.py $HOME/avatarchatbot.py

--- a/AvatarChatbot/docker_compose/amd/gpu/rocm/README.md
+++ b/AvatarChatbot/docker_compose/amd/gpu/rocm/README.md
@@ -14,7 +14,7 @@ cd GenAIComps
 ### 2. Build ASR Image

 ```bash
-docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/whisper/src/Dockerfile .
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .


 docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/Dockerfile .
@@ -29,7 +29,7 @@ docker build --no-cache -t opea/llm-textgen:latest --build-arg https_proxy=$http
 ### 4. Build TTS Image

 ```bash
-docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/speecht5/src/Dockerfile .
+docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .

 docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/Dockerfile .
 ```
--- a/AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/AvatarChatbot/docker_compose/amd/gpu/rocm/compose.yaml
@@ -42,12 +42,12 @@ services:
    environment:
      TTS_ENDPOINT: ${TTS_ENDPOINT}
  tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.4.1-rocm
+    image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
    container_name: tgi-service
    ports:
      - "${TGI_SERVICE_PORT:-3006}:80"
    volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "./data:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
@@ -66,6 +66,24 @@ services:
      - seccomp:unconfined
    ipc: host
    command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192
+  llm:
+    image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
+    container_name: llm-tgi-server
+    depends_on:
+      - tgi-service
+    ports:
+      - "3007:9000"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+      LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
+      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      OPENAI_API_KEY: ${OPENAI_API_KEY}
+    restart: unless-stopped
  wav2lip-service:
    image: ${REGISTRY:-opea}/wav2lip:${TAG:-latest}
    container_name: wav2lip-service
@@ -107,7 +125,7 @@ services:
    container_name: avatarchatbot-backend-server
    depends_on:
      - asr
-      - tgi-service
+      - llm
      - tts
      - animation
    ports:
--- a/AvatarChatbot/docker_compose/amd/gpu/rocm/set_env.sh
+++ b/AvatarChatbot/docker_compose/amd/gpu/rocm/set_env.sh
@@ -30,7 +30,7 @@ export ANIMATION_SERVICE_HOST_IP=${host_ip}
 export MEGA_SERVICE_PORT=8888
 export ASR_SERVICE_PORT=3001
 export TTS_SERVICE_PORT=3002
-export LLM_SERVICE_PORT=3006
+export LLM_SERVICE_PORT=3007
 export ANIMATION_SERVICE_PORT=3008

 export DEVICE="cpu"
--- a/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md
+++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md
@@ -14,7 +14,7 @@ cd GenAIComps
 ### 2. Build ASR Image

 ```bash
-docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/whisper/src/Dockerfile .
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
 ```

 ### 3. Build LLM Image
@@ -24,7 +24,7 @@ Intel Xeon optimized image hosted in huggingface repo will be used for TGI servi
 ### 4. Build TTS Image

 ```bash
-docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/speecht5/src/Dockerfile .
+docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
 ```

 ### 5. Build Animation Image
--- a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml
@@ -31,7 +31,7 @@ services:
    ports:
      - "3006:80"
    volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "./data:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
--- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md
+++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md
@@ -14,7 +14,7 @@ cd GenAIComps
 ### 2. Build ASR Image

 ```bash
-docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/whisper/src/Dockerfile.intel_hpu .
+docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
 ```

 ### 3. Build LLM Image
@@ -24,7 +24,7 @@ Intel Gaudi optimized image hosted in huggingface repo will be used for TGI serv
 ### 4. Build TTS Image

 ```bash
-docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/speecht5/src/Dockerfile.intel_hpu .
+docker build -t opea/speecht5-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile.intel_hpu .
 ```

 ### 5. Build Animation Image
--- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -43,7 +43,7 @@ services:
    ports:
      - "3006:80"
    volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "./data:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
--- a/AvatarChatbot/docker_image_build/build.yaml
+++ b/AvatarChatbot/docker_image_build/build.yaml
@@ -14,13 +14,13 @@ services:
  whisper-gaudi:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/whisper/src/Dockerfile.intel_hpu
+      dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu
    extends: avatarchatbot
    image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
  whisper:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/whisper/src/Dockerfile
+      dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile
    extends: avatarchatbot
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
  asr:
@@ -38,13 +38,13 @@ services:
  speecht5-gaudi:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/speecht5/src/Dockerfile.intel_hpu
+      dockerfile: comps/tts/src/integrations/dependency/speecht5/Dockerfile.intel_hpu
    extends: avatarchatbot
    image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
  speecht5:
    build:
      context: GenAIComps
-      dockerfile: comps/third_parties/speecht5/src/Dockerfile
+      dockerfile: comps/tts/src/integrations/dependency/speecht5/Dockerfile
    extends: avatarchatbot
    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
  tts:
--- a/AvatarChatbot/tests/test_compose_on_gaudi.sh
+++ b/AvatarChatbot/tests/test_compose_on_gaudi.sh
@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
 echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
 export REGISTRY=${IMAGE_REPO}
 export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"./data"}

 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
@@ -87,16 +86,15 @@ function start_services() {
    docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
    until [[ "$n" -ge 200 ]]; do
-       docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log && docker logs whisper-service 2>&1 | tee $LOG_PATH/whisper_service_start.log && docker logs speecht5-service 2>&1 | tee $LOG_PATH/speecht5_service_start.log
-       if grep -q Connected $LOG_PATH/tgi_service_start.log && grep -q running $LOG_PATH/whisper_service_start.log && grep -q running $LOG_PATH/speecht5_service_start.log; then
+       docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log
+       if grep -q Connected $LOG_PATH/tgi_service_start.log; then
           break
       fi
-       sleep 10s
+       sleep 5s
       n=$((n+1))
    done
    echo "All services are up and running"
-    # sleep 5s
-    sleep 1m
+    sleep 5s
 }


--- a/AvatarChatbot/tests/test_compose_on_rocm.sh
+++ b/AvatarChatbot/tests/test_compose_on_rocm.sh
@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
 echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
 export REGISTRY=${IMAGE_REPO}
 export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"/var/lib/GenAI/data"}

 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
@@ -27,7 +26,7 @@ function build_docker_images() {
    git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="avatarchatbot whisper asr speecht5 tts wav2lip animation"
+    service_list="avatarchatbot whisper asr llm-textgen speecht5 tts wav2lip animation"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
@@ -65,7 +64,7 @@ function start_services() {
    export MEGA_SERVICE_PORT=8888
    export ASR_SERVICE_PORT=3001
    export TTS_SERVICE_PORT=3002
-    export LLM_SERVICE_PORT=3006
+    export LLM_SERVICE_PORT=3007
    export ANIMATION_SERVICE_PORT=3008

    export DEVICE="cpu"
--- a/AvatarChatbot/tests/test_compose_on_xeon.sh
+++ b/AvatarChatbot/tests/test_compose_on_xeon.sh
@@ -9,7 +9,6 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
 echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
 export REGISTRY=${IMAGE_REPO}
 export TAG=${IMAGE_TAG}
-export MODEL_CACHE=${model_cache:-"./data"}

 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
@@ -86,16 +85,15 @@ function start_services() {
    # Start Docker Containers
    docker compose up -d
    n=0
-    until [[ "$n" -ge 200 ]]; do
-       docker logs tgi-service > $LOG_PATH/tgi_service_start.log && docker logs whisper-service 2>&1 | tee $LOG_PATH/whisper_service_start.log && docker logs speecht5-service 2>&1 | tee $LOG_PATH/speecht5_service_start.log
-       if grep -q Connected $LOG_PATH/tgi_service_start.log && grep -q running $LOG_PATH/whisper_service_start.log && grep -q running $LOG_PATH/speecht5_service_start.log; then
+    until [[ "$n" -ge 100 ]]; do
+       docker logs tgi-service > $LOG_PATH/tgi_service_start.log
+       if grep -q Connected $LOG_PATH/tgi_service_start.log; then
           break
       fi
-       sleep 10s
+       sleep 5s
       n=$((n+1))
    done
    echo "All services are up and running"
-    sleep 1m
 }


@@ -106,7 +104,6 @@ function validate_megaservice() {
    if [[ $result == *"mp4"* ]]; then
        echo "Result correct."
    else
-        echo "Result wrong, print docker logs."
        docker logs whisper-service > $LOG_PATH/whisper-service.log
        docker logs speecht5-service > $LOG_PATH/speecht5-service.log
        docker logs tgi-service > $LOG_PATH/tgi-service.log
@@ -120,6 +117,11 @@ function validate_megaservice() {
 }


+#function validate_frontend() {
+
+#}
+
+
 function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon
    docker compose down
@@ -127,6 +129,7 @@ function stop_docker() {


 function main() {
+
    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_services
--- a/ChatQnA/README.md
+++ b/ChatQnA/README.md
@@ -1,22 +1,148 @@
 # ChatQnA Application

-Chatbots are the most widely adopted use case for leveraging the powerful chat and reasoning capabilities of large language models (LLMs). The retrieval augmented generation (RAG) architecture is quickly becoming the industry standard for chatbot development. It combines the benefits of a knowledge base (via a vector store) and generative models to reduce hallucinations, maintain up-to-date information, and leverage domain-specific knowledge.
+Chatbots are the most widely adopted use case for leveraging the powerful chat and reasoning capabilities of large language models (LLMs). The retrieval augmented generation (RAG) architecture is quickly becoming the industry standard for chatbot development. It combines the benefits of a knowledge base (via a vector store) and generative models to reduce hallucinations, maintain up-to-date information, and leverage domain-specific knowledge.

-RAG bridges the knowledge gap by dynamically fetching relevant information from external sources, ensuring that the response generated remains factual and current. Vector databases are at the core of this architecture, enabling efficient retrieval of semantically relevant information. These databases store data as vectors, allowing RAG to swiftly access the most pertinent documents or data points based on semantic similarity.
+RAG bridges the knowledge gap by dynamically fetching relevant information from external sources, ensuring that the responses generated remain factual and current. At the core of this architecture are vector databases, which enable efficient retrieval of semantically relevant information. These databases store data as vectors, allowing RAG to swiftly access the most pertinent documents or data points based on semantic similarity.

-## Table of contents
+# Table of contents

-1. [Architecture](#architecture)
-2. [Deployment Options](#deployment-options)
-3. [Monitoring and Tracing](#monitor-and-tracing)
+1. [Automated Terraform Deployment](#-automated-terraform-deployment-using-intel-optimized-cloud-modules-for-terraform)
+2. [Automated Deployment to Ubuntu based system](#automated-deployment-to-ubuntu-based-system-if-not-using-terraform-using-intel-optimized-cloud-modules-for-ansible)
+3. [Manual Deployment](#manually-deploy-chatqna-service)
+4. [Architecture and Deploy Details](#architecture-and-deploy-details)
+5. [Consume Service](#consume-chatqna-service-with-rag)
+6. [Monitoring and Tracing](#monitoring-opea-service-with-prometheus-and-grafana-dashboard)

-## Architecture
+## 🤖 Automated Terraform Deployment using Intel® Optimized Cloud Modules for **Terraform**

-The ChatQnA application is a customizable end-to-end workflow that leverages the capabilities of LLMs and RAG efficiently. ChatQnA architecture is shown below:
+| Cloud Provider       | Intel Architecture                | Intel Optimized Cloud Module for Terraform                                                                                         | Comments                                                             |
+| -------------------- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- |
+| AWS                  | 4th Gen Intel Xeon with Intel AMX | [AWS Module](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna)                          | Uses meta-llama/Meta-Llama-3-8B-Instruct by default                  |
+| AWS Falcon2-11B      | 4th Gen Intel Xeon with Intel AMX | [AWS Module with Falcon11B](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna-falcon11B) | Uses TII Falcon2-11B LLM Model                                       |
+| GCP                  | 5th Gen Intel Xeon with Intel AMX | [GCP Module](https://github.com/intel/terraform-intel-gcp-vm/tree/main/examples/gen-ai-xeon-opea-chatqna)                          | Also supports Confidential AI by using Intel® TDX with 4th Gen Xeon |
+| Azure                | 5th Gen Intel Xeon with Intel AMX | Work-in-progress                                                                                                                   | Work-in-progress                                                     |
+| Intel Tiber AI Cloud | 5th Gen Intel Xeon with Intel AMX | Work-in-progress                                                                                                                   | Work-in-progress                                                     |

+## Automated Deployment to Ubuntu based system (if not using Terraform) using Intel® Optimized Cloud Modules for **Ansible**
+
+To deploy to existing Xeon Ubuntu based system, use our Intel Optimized Cloud Modules for Ansible. This is the same Ansible playbook used by Terraform.
+Use this if you are not using Terraform and have provisioned your system with another tool or manually including bare metal.
+
+| Operating System | Intel Optimized Cloud Module for Ansible                                                                          |
+| ---------------- | ----------------------------------------------------------------------------------------------------------------- |
+| Ubuntu 20.04     | [ChatQnA Ansible Module](https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-opea-chatqna-xeon) |
+| Ubuntu 22.04     | Work-in-progress                                                                                                  |
+
+## Manually Deploy ChatQnA Service
+
+The ChatQnA service can be effortlessly deployed on Intel Gaudi2, Intel Xeon Scalable Processors, Nvidia GPU and AMD GPU.
+
+Two types of ChatQnA pipeline are supported now: `ChatQnA with/without Rerank`. The `ChatQnA without Rerank` pipeline (including Embedding, Retrieval, and LLM) is offered for Xeon customers who cannot run the rerank service on HPU yet require high performance and accuracy.
+
+Quick Start Deployment Steps:
+
+1. Set up the environment variables.
+2. Run Docker Compose.
+3. Consume the ChatQnA Service.
+
+Note:
+
+1. If you do not have Docker installed, you can run this script to install it: `bash docker_compose/install_docker.sh`.
+
+2. The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) `or` you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+
+### Quick Start: 1.Setup Environment Variable
+
+To set up environment variables for deploying ChatQnA services, follow these steps:
+
+1. Set the required environment variables:
+
+   ```bash
+   # Example: host_ip="192.168.1.1"
+   export host_ip="External_Public_IP"
+   # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+   export no_proxy="Your_No_Proxy"
+   export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
+   ```
+
+2. If you are in a proxy environment, also set the proxy-related environment variables:
+
+   ```bash
+   export http_proxy="Your_HTTP_Proxy"
+   export https_proxy="Your_HTTPs_Proxy"
+   ```
+
+3. Set up other environment variables:
+
+   > Notice that you can only choose **one** hardware option below to set up envs according to your hardware. Make sure port numbers are set correctly as well.
+
+   ```bash
+   # on Gaudi
+   cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
+   source ./set_env.sh
+   export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-node-exporter-1
+   # on Xeon
+   cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+   source ./set_env.sh
+   export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,xeon-node-exporter-1
+   # on Nvidia GPU
+   cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
+   source ./set_env.sh
+   export no_proxy="Your_No_Proxy",chatqna-ui-server,chatqna-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service
+   ```
+
+### Quick Start: 2.Run Docker Compose
+
+Select the compose.yaml file that matches your hardware.
+
+CPU example:
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+# cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
+# cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu/
+docker compose up -d
+```
+
+To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.  
+CPU example with Open Telemetry feature:
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+docker compose -f compose.yaml -f compose.telemetry.yaml up -d
+```
+
+It will automatically download the Docker images from Docker Hub:
+
+```bash
+docker pull opea/chatqna:latest
+docker pull opea/chatqna-ui:latest
+```
+
+In the following cases, you can build the Docker images from source yourself.
+
+- Failed to download the docker image.
+
+- If you want to use a specific version of Docker image.
+
+Please refer to the 'Build Docker Images' in [Guide](docker_compose/intel/cpu/xeon/README.md).
+
+### QuickStart: 3.Consume the ChatQnA Service
+
+```bash
+curl http://${host_ip}:8888/v1/chatqna \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": "What is the revenue of Nike in 2023?"
+    }'
+```
+
+## Architecture and Deploy details
+
+The ChatQnA architecture is shown below:
 ![architecture](./assets/img/chatqna_architecture.png)

-This application is modular as it leverages each component as a microservice(as defined in [GenAIComps](https://github.com/opea-project/GenAIComps)) that can scale independently. It comprises data preparation, embedding, retrieval, reranker(optional) and LLM microservices. All these microservices are stitched together by the ChatQnA megaservice that orchestrates the data through these microservices. The flow chart below shows the information flow between different microservices for this example.
+The ChatQnA example is implemented using the component-level microservices defined in [GenAIComps](https://github.com/opea-project/GenAIComps). The flow chart below shows the information flow between different microservices for this example.

 ```mermaid
 ---
@@ -92,30 +218,192 @@ flowchart LR

 ```

-## Deployment Options
+This ChatQnA use case performs RAG using LangChain, Redis VectorDB and Text Generation Inference on [Intel Gaudi2](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html) or [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html).
+The table below describes, for each microservice component in the ChatQnA architecture, the default open source project, hardware, port, and endpoint.

-The table below lists currently available deployment options. They outline in detail the implementation of this example on selected hardware.
+Gaudi default compose.yaml

-| Category                | Deployment Option            | Description                                                                                                                                                                                                                                                                          |
-| ----------------------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| On-premise Deployments  | Docker compose               | [ChatQnA deployment on Xeon](./docker_compose/intel/cpu/xeon)                                                                                                                                                                                                                        |
-|                         |                              | [ChatQnA deployment on AI PC](./docker_compose/intel/cpu/aipc)                                                                                                                                                                                                                       |
-|                         |                              | [ChatQnA deployment on Gaudi](./docker_compose/intel/hpu/gaudi)                                                                                                                                                                                                                      |
-|                         |                              | [ChatQnA deployment on Nvidia GPU](./docker_compose/nvidia/gpu)                                                                                                                                                                                                                      |
-|                         |                              | [ChatQnA deployment on AMD ROCm](./docker_compose/amd/gpu/rocm)                                                                                                                                                                                                                      |
-|                         | Kubernetes                   | [Helm Charts](./kubernetes/helm)                                                                                                                                                                                                                                                     |
-| Cloud Service Providers | AWS                          | [Terraform deployment on 4th Gen Intel Xeon with Intel AMX using meta-llama/Meta-Llama-3-8B-Instruct ](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna)                                                                                  |
-|                         |                              | [Terraform deployment on 4th Gen Intel Xeon with Intel AMX using TII Falcon2-11B](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna-falcon11B)                                                                                             |
-|                         | GCP                          | [Terraform deployment on 5th Gen Intel Xeon with Intel AMX(support Confidential AI by using Intel® TDX](https://github.com/intel/terraform-intel-gcp-vm/tree/main/examples/gen-ai-xeon-opea-chatqna)                                                                                |
-|                         | Azure                        | [Terraform deployment on 4th/5th Gen Intel Xeon with Intel AMX & Intel TDX](https://github.com/intel/terraform-intel-azure-linux-vm/tree/main/examples/azure-gen-ai-xeon-opea-chatqna-tdx)                                                                                           |
-|                         | Intel Tiber AI Cloud         | Coming Soon                                                                                                                                                                                                                                                                          |
-|                         | Any Xeon based Ubuntu system | [ChatQnA Ansible Module for Ubuntu 20.04](https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-opea-chatqna-xeon) .Use this if you are not using Terraform and have provisioned your system either manually or with another tool, including directly on bare metal. |
+| MicroService | Open Source Project | HW    | Port | Endpoint             |
+| ------------ | ------------------- | ----- | ---- | -------------------- |
+| Embedding    | Langchain           | Xeon  | 6000 | /v1/embeddings       |
+| Retriever    | Langchain, Redis    | Xeon  | 7000 | /v1/retrieval        |
+| Reranking    | Langchain, TEI      | Gaudi | 8000 | /v1/reranking        |
+| LLM          | Langchain, TGI      | Gaudi | 9000 | /v1/chat/completions |
+| Dataprep     | Redis, Langchain    | Xeon  | 6007 | /v1/dataprep/ingest  |

-## Monitor and Tracing
+### Required Models

-Follow [OpenTelemetry OPEA Guide](https://opea-project.github.io/latest/tutorial/OpenTelemetry/OpenTelemetry_OPEA_Guide.html) to understand how to use OpenTelemetry tracing and metrics in OPEA.  
-For ChatQnA specific tracing and metrics monitoring, follow [OpenTelemetry on ChatQnA](https://opea-project.github.io/latest/tutorial/OpenTelemetry/deploy/ChatQnA.html) section.
+By default, the embedding, reranking and LLM models are set to the values listed below:

-## FAQ Generation Application
+| Service   | Model                               |
+| --------- | ----------------------------------- |
+| Embedding | BAAI/bge-base-en-v1.5               |
+| Reranking | BAAI/bge-reranker-base              |
+| LLM       | meta-llama/Meta-Llama-3-8B-Instruct |

-FAQ Generation Application leverages the power of large language models (LLMs) to revolutionize the way you interact with and comprehend complex textual data. By harnessing cutting-edge natural language processing techniques, our application can automatically generate comprehensive and natural-sounding frequently asked questions (FAQs) from your documents, legal texts, customer queries, and other sources. We merged the FaqGen into the ChatQnA example, which utilize LangChain to implement FAQ Generation and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
+Change the `xxx_MODEL_ID` in `docker_compose/xxx/set_env.sh` for your needs.
+
+For customers with proxy issues, the models from [ModelScope](https://www.modelscope.cn/models) are also supported in ChatQnA. Refer to [this readme](docker_compose/intel/cpu/xeon/README.md) for details.
+
+### Deploy ChatQnA on Gaudi
+
+Find the corresponding [compose.yaml](./docker_compose/intel/hpu/gaudi/compose.yaml).
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
+docker compose up -d
+```
+
+To enable OpenTelemetry tracing, the `compose.telemetry.yaml` file needs to be merged with the default `compose.yaml` file.
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
+docker compose -f compose.yaml -f compose.telemetry.yaml up -d
+```
+
+Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.
+
+### Deploy ChatQnA on Xeon
+
+Find the corresponding [compose.yaml](./docker_compose/intel/cpu/xeon/compose.yaml).
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+docker compose up -d
+```
+
+To enable OpenTelemetry tracing, the `compose.telemetry.yaml` file needs to be merged with the default `compose.yaml` file.
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+docker compose -f compose.yaml -f compose.telemetry.yaml up -d
+```
+
+Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.
+
+### Deploy ChatQnA on NVIDIA GPU
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu/
+docker compose up -d
+```
+
+Refer to the [NVIDIA GPU Guide](./docker_compose/nvidia/gpu/README.md) for more instructions on building docker images from source.
+
+### Deploy ChatQnA on Kubernetes using Helm Chart
+
+Refer to the [ChatQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying ChatQnA on Kubernetes.
+
+### Deploy ChatQnA on AI PC
+
+Refer to the [AI PC Guide](./docker_compose/intel/cpu/aipc/README.md) for instructions on deploying ChatQnA on AI PC.
+
+### Deploy ChatQnA on Red Hat OpenShift Container Platform (RHOCP)
+
+Refer to the [Intel Technology enabling for Openshift readme](https://github.com/intel/intel-technology-enabling-for-openshift/blob/main/workloads/opea/chatqna/README.md) for instructions to deploy ChatQnA prototype on RHOCP with [Red Hat OpenShift AI (RHOAI)](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai).
+
+## Consume ChatQnA Service with RAG
+
+### Check Service Status
+
+Before consuming ChatQnA Service, make sure the vLLM/TGI service is ready, which takes some time.
+
+```bash
+# vLLM example
+docker logs vllm-gaudi-server 2>&1 | grep complete
+# TGI example
+docker logs tgi-gaudi-server | grep Connected
+```
+
+Wait to consume the ChatQnA service until the log output contains a line like the ones below.
+
+```log
+# vLLM
+INFO: Application startup complete.
+# TGI
+2024-09-03T02:47:53.402023Z  INFO text_generation_router::server: router/src/server.rs:2311: Connected
+```
+
+### Upload RAG Files (Optional)
+
+To chat with retrieved information, you need to upload a file using `Dataprep` service.
+
+Here is an example using the `Nike 2023` PDF file.
+
+```bash
+# download pdf file
+wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
+# upload pdf file with dataprep
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./nke-10k-2023.pdf"
+```
+
+### Consume Chat Service
+
+Two ways of consuming ChatQnA Service:
+
+1. Use cURL command on terminal
+
+   ```bash
+   curl http://${host_ip}:8888/v1/chatqna \
+       -H "Content-Type: application/json" \
+       -d '{
+           "messages": "What is the revenue of Nike in 2023?"
+       }'
+   ```
+
+2. Access via frontend
+
+   To access the frontend, open the following URL in your browser: `http://{host_ip}:5173`
+
+   By default, the UI runs on port 5173 internally.
+
+   If you choose conversational UI, use this URL: `http://{host_ip}:5174`
+
+## Troubleshooting
+
+1. If you get errors like "Access Denied", [validate microservices](https://github.com/opea-project/GenAIExamples/tree/main/ChatQnA/docker_compose/intel/cpu/xeon/README.md#validate-microservices) first. A simple example:
+
+   ```bash
+   http_proxy="" curl ${host_ip}:6006/embed -X POST  -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json'
+   ```
+
+2. (Docker only) If all microservices work well, check the port ${host_ip}:8888. The port may already be allocated by other users; if so, you can change it in `compose.yaml`.
+
+3. (Docker only) If you get errors like "The container name is in use", change container name in `compose.yaml`.
+
+## Monitoring OPEA Service with Prometheus and Grafana dashboard
+
+OPEA microservice deployment can easily be monitored through Grafana dashboards in conjunction with Prometheus data collection. Follow the [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/grafana/README.md) to set up Prometheus and Grafana servers and import dashboards to monitor the OPEA service.
+
+![chatqna dashboards](./assets/img/chatqna_dashboards.png)
+![tgi dashboard](./assets/img/tgi_dashboard.png)
+
+## Tracing Services with OpenTelemetry Tracing and Jaeger
+
+> NOTE: This feature is disabled by default. Please check the Deploy ChatQnA sections for how to enable this feature with the `compose.telemetry.yaml` file.
+
+OPEA microservice and TGI/TEI serving can easily be traced through Jaeger dashboards in conjunction with OpenTelemetry Tracing feature. Follow the [README](https://github.com/opea-project/GenAIComps/tree/main/comps/cores/telemetry#tracing) to trace additional functions if needed.
+
+Tracing data is exported to http://{EXTERNAL_IP}:4318/v1/traces via Jaeger.
+Users can get the external IP with the command below.
+
+```bash
+ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+'
+```
+
+Access the Jaeger dashboard UI at http://{EXTERNAL_IP}:16686
+
+For TGI serving on Gaudi, users could see different services like opea, TEI and TGI.
+![Screenshot from 2024-12-27 11-58-18](https://github.com/user-attachments/assets/6126fa70-e830-4780-bd3f-83cb6eff064e)
+
+Here is a screenshot for one tracing of TGI serving request.
+![Screenshot from 2024-12-27 11-26-25](https://github.com/user-attachments/assets/3a7c51c6-f422-41eb-8e82-c3df52cd48b8)
+
+There are also OPEA related tracings. Users could understand the time breakdown of each service request by looking into each opea:schedule operation.
+![image](https://github.com/user-attachments/assets/6137068b-b374-4ff8-b345-993343c0c25f)
+
+There could be async functions such as `llm/MicroService_asyn_generate`, in which case the user needs to check the trace of the async function in another operation like
+opea:llm_generate_stream.
+![image](https://github.com/user-attachments/assets/a973d283-198f-4ce2-a7eb-58515b77503e)
--- a/ChatQnA/README_miscellaneous.md
+++ b/ChatQnA/README_miscellaneous.md
@@ -1,86 +0,0 @@
-# ChatQnA Docker Image Build
-
-## Table of contents
-
-1. [Build MegaService Docker Image](#Build-MegaService-Docker-Image)
-2. [Build Basic UI Docker Image](#Build-Basic-UI-Docker-Image)
-3. [Build Conversational React UI Docker Image](#Build-Conversational-React-UI-Docker-Image)
-4. [Troubleshooting](#Troubleshooting)
-
-## Build MegaService Docker Image
-
-To construct the MegaService with Rerank, we utilize the [GenAIExamples](https://github.com/opea-project/GenAIExamples.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image using the command below:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples.git
-git fetch && git checkout tags/v1.2
-cd GenAIExamples/ChatQnA
-docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-## Build Basic UI Docker Image
-
-Build the Frontend Docker Image using the command below:
-
-```bash
-cd GenAIExamples/ChatQnA/ui
-docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
-```
-
-## Build Conversational React UI Docker Image (Optional)
-
-Build a frontend Docker image for an interactive conversational UI experience with ChatQnA MegaService
-
-**Export the value of the public IP address of your host machine server to the `host_ip` environment variable**
-
-```bash
-cd GenAIExamples/ChatQnA/ui
-docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-## Troubleshooting
-
-1. If you get errors like "Access Denied", [validate microservices](https://github.com/opea-project/GenAIExamples/tree/main/ChatQnA/docker_compose/intel/cpu/xeon/README.md#validate-microservices) first. A simple example:
-
-   ```bash
-   http_proxy="" curl ${host_ip}:6006/embed -X POST  -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json'
-   ```
-
-2. (Docker only) If all microservices work well, check the port ${host_ip}:8888, the port may be allocated by other users, you can modify the `compose.yaml`.
-
-3. (Docker only) If you get errors like "The container name is in use", change container name in `compose.yaml`.
-
-## Monitoring OPEA Services with Prometheus and Grafana Dashboard
-
-OPEA microservice deployment can easily be monitored through Grafana dashboards using data collected via Prometheus. Follow the [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/grafana/README.md) to setup Prometheus and Grafana servers and import dashboards to monitor the OPEA services.
-
-![chatqna dashboards](./assets/img/chatqna_dashboards.png)
-![tgi dashboard](./assets/img/tgi_dashboard.png)
-
-## Tracing with OpenTelemetry and Jaeger
-
-> NOTE: This feature is disabled by default. Please use the compose.telemetry.yaml file to enable this feature.
-
-OPEA microservice and [TGI](https://huggingface.co/docs/text-generation-inference/en/index)/[TEI](https://huggingface.co/docs/text-embeddings-inference/en/index) serving can easily be traced through [Jaeger](https://www.jaegertracing.io/) dashboards in conjunction with [OpenTelemetry](https://opentelemetry.io/) Tracing feature. Follow the [README](https://github.com/opea-project/GenAIComps/tree/main/comps/cores/telemetry#tracing) to trace additional functions if needed.
-
-Tracing data is exported to http://{EXTERNAL_IP}:4318/v1/traces via Jaeger.
-Users could also get the external IP via below command.
-
-```bash
-ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+'
-```
-
-Access the Jaeger dashboard UI at http://{EXTERNAL_IP}:16686
-
-For TGI serving on Gaudi, users could see different services like opea, TEI and TGI.
-![Screenshot from 2024-12-27 11-58-18](https://github.com/user-attachments/assets/6126fa70-e830-4780-bd3f-83cb6eff064e)
-
-Here is a screenshot for one tracing of TGI serving request.
-![Screenshot from 2024-12-27 11-26-25](https://github.com/user-attachments/assets/3a7c51c6-f422-41eb-8e82-c3df52cd48b8)
-
-There are also OPEA related tracings. Users could understand the time breakdown of each service request by looking into each opea:schedule operation.
-![image](https://github.com/user-attachments/assets/6137068b-b374-4ff8-b345-993343c0c25f)
-
-There could be asynchronous function such as `llm/MicroService_asyn_generate` and user needs to check the trace of the asynchronous function in another operation like
-opea:llm_generate_stream.
-![image](https://github.com/user-attachments/assets/a973d283-198f-4ce2-a7eb-58515b77503e)
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -25,7 +25,7 @@ class ChatTemplate:
    @staticmethod
    def generate_rag_prompt(question, documents):
        context_str = "\n".join(documents)
-        if context_str and len(re.findall("[\u4e00-\u9fff]", context_str)) / len(context_str) >= 0.3:
+        if context_str and len(re.findall("[\u4E00-\u9FFF]", context_str)) / len(context_str) >= 0.3:
            # chinese context
            template = """
 ### 你将扮演一个乐于助人、尊重他人并诚实的助手，你的目标是帮助用户解答问题。有效地利用来自本地知识库的搜索结果。确保你的回答中只包含相关信息。如果你不确定问题的答案，请避免分享不准确的信息。
@@ -58,7 +58,6 @@ RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 80))
 LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 80))
 LLM_MODEL = os.getenv("LLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", None)


 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -240,7 +239,6 @@ class ChatQnAService:
            name="llm",
            host=LLM_SERVER_HOST_IP,
            port=LLM_SERVER_PORT,
-            api_key=OPENAI_API_KEY,
            endpoint="/v1/chat/completions",
            use_remote_service=True,
            service_type=ServiceType.LLM,
@@ -274,7 +272,6 @@ class ChatQnAService:
            name="llm",
            host=LLM_SERVER_HOST_IP,
            port=LLM_SERVER_PORT,
-            api_key=OPENAI_API_KEY,
            endpoint="/v1/chat/completions",
            use_remote_service=True,
            service_type=ServiceType.LLM,
@@ -320,7 +317,6 @@ class ChatQnAService:
            name="llm",
            host=LLM_SERVER_HOST_IP,
            port=LLM_SERVER_PORT,
-            api_key=OPENAI_API_KEY,
            endpoint="/v1/chat/completions",
            use_remote_service=True,
            service_type=ServiceType.LLM,
--- a/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -25,15 +25,9 @@ services:
      INDEX_NAME: ${CHATQNA_INDEX_NAME}
      TEI_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped

  chatqna-tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: chatqna-tei-embedding-service
    ports:
      - "${CHATQNA_TEI_EMBEDDING_PORT}:80"
@@ -68,7 +62,7 @@ services:
    restart: unless-stopped

  chatqna-tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: chatqna-tei-reranking-service
    ports:
      - "${CHATQNA_TEI_RERANKING_PORT}:80"
@@ -115,18 +109,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-backend-server
    depends_on:
-      chatqna-redis-vector-db:
-        condition: service_started
-      chatqna-tei-embedding-service:
-        condition: service_started
-      chatqna-retriever:
-        condition: service_started
-      chatqna-tei-reranking-service:
-        condition: service_started
-      chatqna-tgi-service:
-        condition: service_started
-      chatqna-dataprep-service:
-        condition: service_healthy
+      - chatqna-redis-vector-db
+      - chatqna-tei-embedding-service
+      - chatqna-retriever
+      - chatqna-tei-reranking-service
+      - chatqna-tgi-service
    ports:
      - "${CHATQNA_BACKEND_SERVICE_PORT:-8888}:8888"
    environment:
--- a/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml
@@ -25,15 +25,9 @@ services:
      INDEX_NAME: ${CHATQNA_INDEX_NAME}
      TEI_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped

  chatqna-tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: chatqna-tei-embedding-service
    ports:
      - "${CHATQNA_TEI_EMBEDDING_PORT}:80"
@@ -68,7 +62,7 @@ services:
    restart: unless-stopped

  chatqna-tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: chatqna-tei-reranking-service
    ports:
      - "${CHATQNA_TEI_RERANKING_PORT}:80"
@@ -134,20 +128,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-backend-server
    depends_on:
-      chatqna-redis-vector-db:
-        condition: service_started
-      chatqna-tei-embedding-service:
-        condition: service_started
-      chatqna-retriever:
-        condition: service_started
-      chatqna-tei-reranking-service:
-        condition: service_started
-      chatqna-tgi-service:
-        condition: service_started
-      chatqna-llm-faqgen:
-        condition: service_started
-      chatqna-dataprep-service:
-        condition: service_healthy
+      - chatqna-redis-vector-db
+      - chatqna-tei-embedding-service
+      - chatqna-retriever
+      - chatqna-tei-reranking-service
+      - chatqna-tgi-service
+      - chatqna-llm-faqgen
    ports:
      - "${CHATQNA_BACKEND_SERVICE_PORT}:8888"
    environment:
--- a/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen_vllm.yaml
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen_vllm.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${CHATQNA_INDEX_NAME}
      TEI_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped

  chatqna-tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
@@ -139,20 +133,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-backend-server
    depends_on:
-      chatqna-redis-vector-db:
-        condition: service_started
-      chatqna-tei-embedding-service:
-        condition: service_started
-      chatqna-retriever:
-        condition: service_started
-      chatqna-tei-reranking-service:
-        condition: service_started
-      chatqna-vllm-service:
-        condition: service_started
-      chatqna-llm-faqgen:
-        condition: service_started
-      chatqna-dataprep-redis-service:
-        condition: service_healthy
+      - chatqna-redis-vector-db
+      - chatqna-tei-embedding-service
+      - chatqna-retriever
+      - chatqna-tei-reranking-service
+      - chatqna-vllm-service
+      - chatqna-llm-faqgen
    ports:
      - "${CHATQNA_BACKEND_SERVICE_PORT}:8888"
    environment:
--- a/ChatQnA/docker_compose/amd/gpu/rocm/compose_vllm.yaml
+++ b/ChatQnA/docker_compose/amd/gpu/rocm/compose_vllm.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${CHATQNA_INDEX_NAME}
      TEI_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped

  chatqna-tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
@@ -117,18 +111,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-backend-server
    depends_on:
-      chatqna-redis-vector-db:
-        condition: service_started
-      chatqna-tei-embedding-service:
-        condition: service_started
-      chatqna-retriever:
-        condition: service_started
-      chatqna-tei-reranking-service:
-        condition: service_started
-      chatqna-vllm-service:
-        condition: service_started
-      chatqna-dataprep-service:
-        condition: service_healthy
+      - chatqna-redis-vector-db
+      - chatqna-tei-embedding-service
+      - chatqna-retriever
+      - chatqna-tei-reranking-service
+      - chatqna-vllm-service
    ports:
      - "${CHATQNA_BACKEND_SERVICE_PORT}:8888"
    environment:
--- a/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -98,16 +92,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-aipc-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
-      tei-embedding-service:
-        condition: service_started
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
+      - redis-vector-db
+      - dataprep-redis-service
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
    ports:
      - "8888:8888"
    environment:
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -1,63 +1,56 @@
-# Deploying ChatQnA on Intel® Xeon® Processors
+# Build Mega Service of ChatQnA on Xeon

-This document outlines the single node deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservices on Intel Xeon server. The steps include pulling Docker images, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank` and `llm`.
+This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, `llm` and `faqgen`.

-## Table of contents
+The default pipeline deploys with vLLM as the LLM serving component and leverages the rerank component. It also provides options for not using rerank in the pipeline and for using the TGI backend for the LLM microservice; please refer to the [start-all-the-services-docker-containers](#start-all-the-services-docker-containers) section on this page. In addition, refer to [Build with Pinecone VectorDB](./README_pinecone.md) and [Build with Qdrant VectorDB](./README_qdrant.md) for other deployment variants.

-1. [ChatQnA Quick Start Deployment](#chatqna-quick-start-Deployment)
-2. [ChatQnA Docker Compose file Options](#chatqna-docker-compose-files)
-3. [ChatQnA with Conversational UI](#chatqna-with-conversational-ui-optional)
+Quick Start:

-## ChatQnA Quick Start Deployment
+1. Set up the environment variables.
+2. Run Docker Compose.
+3. Consume the ChatQnA Service.

-This section describes how to quickly deploy and test the ChatQnA service manually on an Intel® Xeon® processor. The basic steps are:
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).

-1. [Access the Code](#access-the-code)
-2. [Generate a HuggingFace Access Token](#generate-a-huggingface-access-token)
-3. [Configure the Deployment Environment](#configure-the-deployment-environment)
-4. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose)
-5. [Check the Deployment Status](#check-the-deployment-status)
-6. [Test the Pipeline](#test-the-pipeline)
-7. [Cleanup the Deployment](#cleanup-the-deployment)
+## Quick Start: 1.Setup Environment Variable

-### Access the Code
+To set up environment variables for deploying ChatQnA services, follow these steps:

-Clone the GenAIExample repository and access the ChatQnA Intel® Gaudi® platform Docker Compose files and supporting scripts:
+1. Set the required environment variables:

-```
-git clone https://github.com/opea-project/GenAIExamples.git
-cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
-```
+   ```bash
+   # Example: host_ip="192.168.1.1"
+   export host_ip="External_Public_IP"
+   export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
+   ```

-Checkout a released version, such as v1.2:
+2. If you are in a proxy environment, also set the proxy-related environment variables:

-```
-git checkout v1.2
-```
+   ```bash
+   export http_proxy="Your_HTTP_Proxy"
+   export https_proxy="Your_HTTPs_Proxy"
+   # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+   export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,llm-faqgen
+   ```

-### Generate a HuggingFace Access Token
+3. Set up other environment variables:

-Some HuggingFace resources, such as some models, are only accessible if the developer have an access token. In the absence of a HuggingFace access token, the developer can create one by first creating an account by following the steps provided at [HuggingFace](https://huggingface.co/) and then generating a [user access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token).
+   ```bash
+   source ./set_env.sh
+   ```

-### Configure the Deployment Environment
+4. Change Model for LLM serving

-To set up environment variables for deploying ChatQnA services, set up some parameters specific to the deployment environment and source the _setup_env.sh_ script in this directory:
+   By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default model can be changed to another validated LLM model.  
+   Please pick one of the [validated LLM models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.  
+   To change the default model defined in set_env.sh, overwrite it by exporting LLM_MODEL_ID to the new model or by modifying set_env.sh, and then repeat step 3.  
+   For example, change to Llama-2-7b-chat-hf using the following command.

-```
-export host_ip="External_Public_IP"           #ip address of the node
-export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
-export http_proxy="Your_HTTP_Proxy"           #http proxy if any
-export https_proxy="Your_HTTPs_Proxy"         #https proxy if any
-export no_proxy=localhost,127.0.0.1,$host_ip  #additional no proxies if needed
-export no_proxy=$no_proxy,chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,llm-faqgen
-source ./set_env.sh
-```
+   ```bash
+   export LLM_MODEL_ID="meta-llama/Llama-2-7b-chat-hf"
+   ```

-Consult the section on [ChatQnA Service configuration](#chatqna-configuration) for information on how service specific configuration parameters affect deployments.
-
-### Deploy the Services Using Docker Compose
-
-To deploy the ChatQnA services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute the command below. It uses the 'compose.yaml' file.
+## Quick Start: 2.Run Docker Compose

 ```bash
 docker compose up -d
@@ -73,54 +66,22 @@ CPU example with Open Telemetry feature:
 docker compose -f compose.yaml -f compose.telemetry.yaml up -d
 ```

-**Note**: developers should build docker image from source when:
+It will automatically download the docker image on `docker hub`:

- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
- Unable to download the docker image.
- Use a specific version of Docker image.
-
-Please refer to the table below to build different microservices from source:
-
-| Microservice | Deployment Guide                                                                              |
-| ------------ | --------------------------------------------------------------------------------------------- |
-| Dataprep     | https://github.com/opea-project/GenAIComps/tree/main/comps/dataprep                           |
-| Embedding    | https://github.com/opea-project/GenAIComps/tree/main/comps/embeddings                         |
-| Retriever    | https://github.com/opea-project/GenAIComps/tree/main/comps/retrievers                         |
-| Reranker     | https://github.com/opea-project/GenAIComps/tree/main/comps/rerankings                         |
-| LLM          | https://github.com/opea-project/GenAIComps/tree/main/comps/llms                               |
-| Megaservice  | [Megaservice build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image) |
-| UI           | [Basic UI build guide](../../../../README_miscellaneous.md#build-ui-docker-image)             |
-
-### Check the Deployment Status
-
-After running docker compose, check if all the containers launched via docker compose have started:
-
-```
-docker ps -a
+```bash
+docker pull opea/chatqna:latest
+docker pull opea/chatqna-ui:latest
 ```

-For the default deployment, the following 10 containers should have started:
+NB: You should build docker image from source by yourself if:

-```
-CONTAINER ID   IMAGE                                                   COMMAND                  CREATED        STATUS        PORTS                                                                                  NAMES
-3b5fa9a722da   opea/chatqna-ui:${RELEASE_VERSION}                                  "docker-entrypoint.s…"   32 hours ago   Up 2 hours   0.0.0.0:5173->5173/tcp, :::5173->5173/tcp                                              chatqna-xeon-ui-server
-d3b37f3d1faa   opea/chatqna:${RELEASE_VERSION}                                     "python chatqna.py"      32 hours ago   Up 2 hours   0.0.0.0:8888->8888/tcp, :::8888->8888/tcp                                              chatqna-xeon-backend-server
-b3e1388fa2ca   opea/reranking-tei:${RELEASE_VERSION}                               "python reranking_te…"   32 hours ago   Up 2 hours   0.0.0.0:8000->8000/tcp, :::8000->8000/tcp                                              reranking-tei-xeon-server
-24a240f8ad1c   opea/retriever-redis:${RELEASE_VERSION}                             "python retriever_re…"   32 hours ago   Up 2 hours   0.0.0.0:7000->7000/tcp, :::7000->7000/tcp                                              retriever-redis-server
-9c0d2a2553e8   opea/embedding-tei:${RELEASE_VERSION}                               "python embedding_te…"   32 hours ago   Up 2 hours   0.0.0.0:6000->6000/tcp, :::6000->6000/tcp                                              embedding-tei-server
-24cae0db1a70   opea/llm-vllm:${RELEASE_VERSION}                                    "bash entrypoint.sh"     32 hours ago   Up 2 hours   0.0.0.0:9000->9000/tcp, :::9000->9000/tcp                                              llm-vllm-server
-ea3986c3cf82   opea/dataprep-redis:${RELEASE_VERSION}                              "python prepare_doc_…"   32 hours ago   Up 2 hours   0.0.0.0:6007->6007/tcp, :::6007->6007/tcp                                              dataprep-redis-server
-e10dd14497a8   redis/redis-stack:7.2.0-v9                              "/entrypoint.sh"         32 hours ago   Up 2 hours   0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp   redis-vector-db
-b98fa07a4f5c   opea/vllm:${RELEASE_VERSION}                                        "python3 -m vllm.ent…"   32 hours ago   Up 2 hours   0.0.0.0:9009->80/tcp, :::9009->80/tcp                                                  vllm-service
-79276cf45a47   ghcr.io/huggingface/text-embeddings-inference:cpu-1.2   "text-embeddings-rou…"   32 hours ago   Up 2 hours   0.0.0.0:6006->80/tcp, :::6006->80/tcp                                                  tei-embedding-server
-4943e5f6cd80   ghcr.io/huggingface/text-embeddings-inference:cpu-1.2   "text-embeddings-rou…"   32 hours ago   Up 2 hours   0.0.0.0:8808->80/tcp, :::8808->80/tcp
-```
+- You are developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
+- You can't download the docker image.
+- You want to use a specific version of Docker image.

-If any issues are encountered during deployment, refer to the [troubleshooting](../../../../README_miscellaneous.md##troubleshooting) section.
+Please refer to the ['Build Docker Images'](#🚀-build-docker-images) section below.

-### Test the Pipeline
-
-Once the ChatQnA services are running, test the pipeline using the following command. This will send a sample query to the ChatQnA service and return a response.
+## Quick Start: 3.Consume the ChatQnA Service

 ```bash
 curl http://${host_ip}:8888/v1/chatqna \
@@ -130,78 +91,225 @@ curl http://${host_ip}:8888/v1/chatqna \
    }'
 ```

-**Note** : Access the ChatQnA UI by web browser through this URL: `http://${host_ip}:80`. Please confirm the `80` port is opened in the firewall. To validate each microservice used in the pipeline refer to the [Validate microservices](#validate-microservices) section.
+## 🚀 Apply Xeon Server on AWS

-### Cleanup the Deployment
+To apply a Xeon server on AWS, start by creating an AWS account if you don't have one already. Then, head to the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home) to begin the process. Within the EC2 service, select the Amazon EC2 M7i or M7i-flex instance type to leverage 4th Generation Intel Xeon Scalable processors that are optimized for demanding workloads.

-To stop the containers associated with the deployment, execute the following command:
+For detailed information about these instance types, you can refer to this [link](https://aws.amazon.com/ec2/instance-types/m7i/). Once you've chosen the appropriate instance type, proceed with configuring your instance settings, including network configurations, security groups, and storage options.

-```
-docker compose -f compose.yaml down
+After launching your instance, you can connect to it using SSH (for Linux instances) or Remote Desktop Protocol (RDP) (for Windows instances). From there, you'll have full access to your Xeon server, allowing you to install, configure, and manage your applications as needed.
+
+### Network Port & Security
+
+- Access the ChatQnA UI by web browser
+
+  The UI is served on port `80`. Please confirm that port `80` is open in the firewall of the EC2 instance.
+
+- Access the microservice by tool or API
+
+  1. Login to the EC2 instance and access by **local IP address** and port.
+
+     This is the recommended approach and requires no changes to the network port settings.
+
+  2. Login to a remote client and access by **public IP address** and port.
+
+     You need to open the microservice's port in the security group settings of the EC2 instance's firewall.
+
+     For detailed guide, please refer to [Validate Microservices](#validate-microservices).
+
+     Note that this increases the security risk, so please confirm before doing it.
+
+## 🚀 Build Docker Images
+
+First of all, you need to build Docker Images locally and install the python package of it.
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
 ```

-## ChatQnA Docker Compose Files
+### 1. Build Retriever Image

-In the context of deploying a ChatQnA pipeline on an Intel® Xeon® platform, we can pick and choose different vector databases, large language model serving frameworks, and remove pieces of the pipeline such as the reranker. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git).
-
-| File                                                         | Description                                                                                                                                                           |
-| ------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [compose.yaml](./compose.yaml)                               | Default compose file using vllm as serving framework and redis as vector database                                                                                     |
-| [compose_milvus.yaml](./compose_milvus.yaml)                 | Uses Milvus as the vector database. All other configurations remain the same as the default                                                                           |
-| [compose_pinecone.yaml](./compose_pinecone.yaml)             | Uses Pinecone as the vector database. All other configurations remain the same as the default. For more details, refer to [README_pinecone.md](./README_pinecone.md). |
-| [compose_qdrant.yaml](./compose_qdrant.yaml)                 | Uses Qdrant as the vector database. All other configurations remain the same as the default. For more details, refer to [README_qdrant.md](./README_qdrant.md).       |
-| [compose_tgi.yaml](./compose_tgi.yaml)                       | Uses TGI as the LLM serving framework. All other configurations remain the same as the default                                                                        |
-| [compose_without_rerank.yaml](./compose_without_rerank.yaml) | Default configuration without the reranker                                                                                                                            |
-| [compose_faqgen.yaml](./compose_faqgen.yaml)                 | Enables FAQ generation using vLLM as the LLM serving framework. For more details, refer to [README_faqgen.md](./README_faqgen.md).                                    |
-| [compose_faqgen_tgi.yaml](./compose_faqgen_tgi.yaml)         | Enables FAQ generation using TGI as the LLM serving framework. For more details, refer to [README_faqgen.md](./README_faqgen.md).                                     |
-| [compose.telemetry.yaml](./compose.telemetry.yaml)           | Helper file for telemetry features for vllm. Can be used along with any compose files that serves vllm                                                                |
-| [compose_tgi.telemetry.yaml](./compose_tgi.telemetry.yaml)   | Helper file for telemetry features for tgi. Can be used along with any compose files that serves tgi                                                                  |
-
-## ChatQnA with Conversational UI (Optional)
-
-To access the Conversational UI (react based) frontend, modify the UI service in the `compose` file used to deploy. Replace `chaqna-xeon-ui-server` service with the `chatqna-xeon-conversation-ui-server` service as per the config below:
-
-```yaml
-chatqna-xeon-conversation-ui-server:
-  image: opea/chatqna-conversation-ui:latest
-  container_name: chatqna-xeon-conversation-ui-server
-  environment:
-    - APP_BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
-    - APP_DATA_PREP_SERVICE_URL=${DATAPREP_SERVICE_ENDPOINT}
-  ports:
-    - "5174:80"
-  depends_on:
-    - chaqna-xeon-backend-server
-  ipc: host
-  restart: always
+```bash
+docker build --no-cache -t opea/retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
 ```

-Once the services are up, open the following URL in the browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If the developer prefers to use a different host port to access the frontend, it can be modified by port mapping in the `compose.yaml` file as shown below:
+### 2. Build Dataprep Image

-```yaml
-  chaqna-gaudi-conversation-ui-server:
-    image: opea/chatqna-conversation-ui:latest
-    ...
-    ports:
-      - "80:80"
+```bash
+docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
+cd ..
 ```

-Here is an example of running ChatQnA (default UI):
+### 3. Build FaqGen LLM Image (Optional)

-![project-screenshot](../../../../assets/img/chat_ui_response.png)
+If you want to enable FAQ generation LLM in the pipeline, please use the below command:

-Here is an example of running ChatQnA with Conversational UI (React):
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
+```

-![project-screenshot](../../../../assets/img/conversation_ui_response.png)
+### 4. Build MegaService Docker Image
+
+To construct the Mega Service with Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build MegaService Docker image via below command:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+cd GenAIExamples/ChatQnA
+docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+```
+
+### 5. Build UI Docker Image
+
+Build frontend Docker image via below command:
+
+```bash
+cd GenAIExamples/ChatQnA/ui
+docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
+```
+
+### 6. Build Conversational React UI Docker Image (Optional)
+
+Build frontend Docker image that enables Conversational experience with ChatQnA megaservice via below command:
+
+**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
+
+```bash
+cd GenAIExamples/ChatQnA/ui
+docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
+
+### 7. Build Nginx Docker Image
+
+```bash
+cd GenAIComps
+docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/nginx/src/Dockerfile .
+```
+
+Then run the command `docker images`; you should see the following 5 Docker images:
+
+1. `opea/dataprep:latest`
+2. `opea/retriever:latest`
+3. `opea/chatqna:latest`
+4. `opea/chatqna-ui:latest`
+5. `opea/nginx:latest`
+
+If FaqGen related docker image is built, you will find one more image:
+
+- `opea/llm-faqgen:latest`
+
+## 🚀 Start Microservices
+
+### Required Models
+
+By default, the embedding, reranking and LLM models are set to a default value as listed below:
+
+| Service   | Model                               |
+| --------- | ----------------------------------- |
+| Embedding | BAAI/bge-base-en-v1.5               |
+| Reranking | BAAI/bge-reranker-base              |
+| LLM       | meta-llama/Meta-Llama-3-8B-Instruct |
+
+Change the `xxx_MODEL_ID` below for your needs.
+
+For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below:
+
+1. Online
+
+   ```bash
+   export HF_TOKEN=${your_hf_token}
+   export HF_ENDPOINT="https://hf-mirror.com"
+   model_name="meta-llama/Meta-Llama-3-8B-Instruct"
+   # Start vLLM LLM Service
+   docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
+   # Start TGI LLM Service
+   docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
+   ```
+
+2. Offline
+
+   - Search your model name in ModelScope. For example, check [this page](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/files) for model `Meta-Llama-3-8B-Instruct`.
+
+   - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`.
+
+   - Run the following command to start the LLM service.
+
+     ```bash
+     export HF_TOKEN=${your_hf_token}
+     export model_path="/path/to/model"
+     # Start vLLM LLM Service
+     docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
+     # Start TGI LLM Service
+     docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
+     ```
+
+### Setup Environment Variables
+
+1. Set the required environment variables:
+
+   ```bash
+   # Example: host_ip="192.168.1.1"
+   export host_ip="External_Public_IP"
+   export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
+   # Example: NGINX_PORT=80
+   export NGINX_PORT=${your_nginx_port}
+   ```
+
+2. If you are in a proxy environment, also set the proxy-related environment variables:
+
+   ```bash
+   export http_proxy="Your_HTTP_Proxy"
+   export https_proxy="Your_HTTPs_Proxy"
+   # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+   export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
+   ```
+
+3. Set up other environment variables:
+
+   ```bash
+   source ./set_env.sh
+   ```
+
+### Start all the services Docker Containers
+
+> Before running the docker compose command, you need to be in the folder that has the docker compose yaml file
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+```
+
+If using vLLM as the LLM serving backend:
+
+```bash
+# Start ChatQnA with Rerank Pipeline
+docker compose -f compose.yaml up -d
+# Start ChatQnA without Rerank Pipeline
+docker compose -f compose_without_rerank.yaml up -d
+# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
+docker compose -f compose.yaml -f compose.telemetry.yaml up -d
+# Start ChatQnA with FaqGen Pipeline
+docker compose -f compose_faqgen.yaml up -d
+```
+
+If using TGI as the LLM serving backend:
+
+```bash
+docker compose -f compose_tgi.yaml up -d
+# Start ChatQnA with Open Telemetry Tracing
+docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
+# Start ChatQnA with FaqGen Pipeline
+docker compose -f compose_faqgen_tgi.yaml up -d
+```

 ### Validate Microservices

-Note, when verifying the microservices by curl or API from remote client, please make sure the **ports** of the microservices are opened in the firewall of the cloud node.  
+Note, when verifying the microservices by curl or API from remote client, please make sure the **ports** of the microservices are opened in the firewall of the cloud node.  
 Follow the instructions to validate MicroServices.
 For details on how to verify the correctness of the response, refer to [how-to-validate_service](../../hpu/gaudi/how_to_validate_service.md).

-1. **TEI Embedding Service**
-   Send a test request to the TEI Embedding Service to ensure it is running correctly:
+1. TEI Embedding Service

   ```bash
   curl http://${host_ip}:6006/embed \
@@ -210,15 +318,13 @@ For details on how to verify the correctness of the response, refer to [how-to-v
       -H 'Content-Type: application/json'
   ```

-   If you receive a connection error, ensure that the service is running and the port 6006 is open in the firewall.
-
-2. **Retriever Microservice**
+2. Retriever Microservice

   To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector
   is determined by the embedding model.
   Here we use the model `EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"`, which vector size is 768.

-   Check the vector dimension of your embedding model, set `your_embedding` dimension equal to it.
+   Check the vector dimension of your embedding model, set `your_embedding` dimension equal to it.

   ```bash
   export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
@@ -228,11 +334,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
     -H 'Content-Type: application/json'
   ```

-   If the response indicates an invalid embedding vector, verify that the vector size matches the model's expected dimension.
-
-3. **TEI Reranking Service**
-
-   To test the TEI Reranking Service, use the following `curl` command:
+3. TEI Reranking Service

   > Skip for ChatQnA without Rerank pipeline

@@ -243,7 +345,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
       -H 'Content-Type: application/json'
   ```

-4. **LLM Backend Service**
+4. LLM backend Service

   In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready.

@@ -273,9 +375,16 @@ For details on how to verify the correctness of the response, refer to [how-to-v
     -H 'Content-Type: application/json'
   ```

-5. **MegaService**
+5. FaqGen LLM Microservice (if enabled)

-   Use the following `curl` command to test the MegaService:
+```bash
+curl http://${host_ip}:${LLM_SERVICE_PORT}/v1/faqgen \
+  -X POST \
+  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
+  -H 'Content-Type: application/json'
+```
+
+6. MegaService

   ```bash
    curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
@@ -283,9 +392,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
          }'
   ```

-6. **Nginx Service**
-
-   Use the following curl command to test the Nginx Service:
+7. Nginx Service

   ```bash
   curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
@@ -293,84 +400,84 @@ For details on how to verify the correctness of the response, refer to [how-to-v
       -d '{"messages": "What is the revenue of Nike in 2023?"}'
   ```

-7. **Dataprep Microservice(Optional) **
+8. Dataprep Microservice (Optional)

-   If you want to update the default knowledge base, you can use the following commands:
+If you want to update the default knowledge base, you can use the following commands:

-   Update Knowledge Base via Local File [nke-10k-2023.pdf](https://github.com/opea-project/GenAIComps/blob/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf). Or
-   click [here](https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf) to download the file via any web browser.
-   Or run this command to get the file on a terminal.
+Update Knowledge Base via Local File [nke-10k-2023.pdf](https://github.com/opea-project/GenAIComps/blob/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf). Or
+click [here](https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf) to download the file via any web browser.
+Or run this command to get the file on a terminal.

-   ```bash
-   wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
-   ```
+```bash
+wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
+```

-   Upload:
+Upload:

-   ```bash
-   curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
-       -H "Content-Type: multipart/form-data" \
-       -F "files=@./nke-10k-2023.pdf"
-   ```
+```bash
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
+     -H "Content-Type: multipart/form-data" \
+     -F "files=@./nke-10k-2023.pdf"
+```

-   This command updates a knowledge base by uploading a local file for processing. Update the file path according to your environment.
+This command updates a knowledge base by uploading a local file for processing. Update the file path according to your environment.

-   Add Knowledge Base via HTTP Links:
+Add Knowledge Base via HTTP Links:

-   ```bash
-   curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
-       -H "Content-Type: multipart/form-data" \
-       -F 'link_list=["https://opea.dev"]'
-   ```
+```bash
+curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
+     -H "Content-Type: multipart/form-data" \
+     -F 'link_list=["https://opea.dev"]'
+```

-   This command updates a knowledge base by submitting a list of HTTP links for processing.
+This command updates a knowledge base by submitting a list of HTTP links for processing.

-   Also, you are able to get the file list that you uploaded:
+Also, you are able to get the file list that you uploaded:

-   ```bash
-   curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
-       -H "Content-Type: application/json"
-   ```
+```bash
+curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
+     -H "Content-Type: application/json"
+```

-   Then you will get the response JSON like this. Notice that the returned `name`/`id` of the uploaded link is `https://xxx.txt`.
+Then you will get the response JSON like this. Notice that the returned `name`/`id` of the uploaded link is `https://xxx.txt`.

-   ```json
-   [
-     {
-       "name": "nke-10k-2023.pdf",
-       "id": "nke-10k-2023.pdf",
-       "type": "File",
-       "parent": ""
-     },
-     {
-       "name": "https://opea.dev.txt",
-       "id": "https://opea.dev.txt",
-       "type": "File",
-       "parent": ""
-     }
-   ]
-   ```
+```json
+[
+  {
+    "name": "nke-10k-2023.pdf",
+    "id": "nke-10k-2023.pdf",
+    "type": "File",
+    "parent": ""
+  },
+  {
+    "name": "https://opea.dev.txt",
+    "id": "https://opea.dev.txt",
+    "type": "File",
+    "parent": ""
+  }
+]
+```

-   To delete the file/link you uploaded:
+To delete the file/link you uploaded:

-   The `file_path` here should be the `id` get from `/v1/dataprep/get` API.
+The `file_path` here should be the `id` get from `/v1/dataprep/get` API.

-   ```bash
-   # delete link
-   curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
-       -d '{"file_path": "https://opea.dev.txt"}' \
-       -H "Content-Type: application/json"
+```bash
+# delete link
+curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+     -d '{"file_path": "https://opea.dev.txt"}' \
+     -H "Content-Type: application/json"

-   # delete file
-   curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
-       -d '{"file_path": "nke-10k-2023.pdf"}' \
-       -H "Content-Type: application/json"
+# delete file
+curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+     -d '{"file_path": "nke-10k-2023.pdf"}' \
+     -H "Content-Type: application/json"

-   # delete all uploaded files and links
-   curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
-       -d '{"file_path": "all"}' \
-       -H "Content-Type: application/json"
-   ```
+# delete all uploaded files and links
+curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+     -d '{"file_path": "all"}' \
+     -H "Content-Type: application/json"
+```

 ### Profile Microservices

@@ -402,7 +509,7 @@ After vLLM profiling is started, users could start asking questions and get resp

 ##### Stop vLLM profiling

-By following command, users could stop vLLM profiling and generate a \*.pt.trace.json.gz file as profiling result  
+By following command, users could stop vLLM profiling and generate a \*.pt.trace.json.gz file as profiling result  
 under /mnt folder in vllm-service docker instance.

 ```bash
@@ -432,6 +539,59 @@ Open a web browser and type "chrome://tracing" or "ui.perfetto.dev", and then lo
 to see the vLLM profiling result as below diagram.
 ![image](https://github.com/user-attachments/assets/55c7097e-5574-41dc-97a7-5e87c31bc286)

-## Conclusion
+## 🚀 Launch the UI

-This guide should enable developer to deploy the default configuration or any of the other compose yaml files for different configurations. It also highlights the configurable parameters that can be set before deployment.
+### Launch with origin port
+
+To access the frontend, open the following URL in your browser: http://{host_ip}:5173. By default, the UI runs on port 5173 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:
+
+```yaml
+  chaqna-gaudi-ui-server:
+    image: opea/chatqna-ui:latest
+    ...
+    ports:
+      - "80:5173"
+```
+
+### Launch with Nginx
+
+If you want to launch the UI using Nginx, open this URL: `http://${host_ip}:${NGINX_PORT}` in your browser to access the frontend.
+
+## 🚀 Launch the Conversational UI (Optional)
+
+To access the Conversational UI (react based) frontend, modify the UI service in the `compose.yaml` file. Replace `chaqna-xeon-ui-server` service with the `chatqna-xeon-conversation-ui-server` service as per the config below:
+
+```yaml
+chaqna-xeon-conversation-ui-server:
+  image: opea/chatqna-conversation-ui:latest
+  container_name: chatqna-xeon-conversation-ui-server
+  environment:
+    - APP_BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
+    - APP_DATA_PREP_SERVICE_URL=${DATAPREP_SERVICE_ENDPOINT}
+  ports:
+    - "5174:80"
+  depends_on:
+    - chaqna-xeon-backend-server
+  ipc: host
+  restart: always
+```
+
+Once the services are up, open the following URL in your browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:
+
+```yaml
+  chaqna-gaudi-conversation-ui-server:
+    image: opea/chatqna-conversation-ui:latest
+    ...
+    ports:
+      - "80:80"
+```
+
+![project-screenshot](../../../../assets/img/chat_ui_init.png)
+
+Here is an example of running ChatQnA:
+
+![project-screenshot](../../../../assets/img/chat_ui_response.png)
+
+Here is an example of running ChatQnA with Conversational UI (React):
+
+![project-screenshot](../../../../assets/img/conversation_ui_response.png)
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README_faqgen.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_faqgen.md
@@ -1,227 +0,0 @@
-# Deploying FAQ Generation on Intel® Xeon® Processors
-
-In today's data-driven world, organizations across various industries face the challenge of managing and understanding vast amounts of information. Legal documents, contracts, regulations, and customer inquiries often contain critical insights buried within dense text. Extracting and presenting these insights in a concise and accessible format is crucial for decision-making, compliance, and customer satisfaction.
-
-Our FAQ Generation Application leverages the power of large language models (LLMs) to revolutionize the way you interact with and comprehend complex textual data. By harnessing cutting-edge natural language processing techniques, our application can automatically generate comprehensive and natural-sounding frequently asked questions (FAQs) from your documents, legal texts, customer queries, and other sources. In this example use case, we utilize LangChain to implement FAQ Generation and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
-
-The FaqGen example is implemented using the component-level microservices defined in [GenAIComps](https://github.com/opea-project/GenAIComps). The flow chart below shows the information flow between different microservices for this example.
-
-```mermaid
---
-config:
-  flowchart:
-    nodeSpacing: 400
-    rankSpacing: 100
-    curve: linear
-  themeVariables:
-    fontSize: 50px
---
-flowchart LR
-    %% Colors %%
-    classDef blue fill:#ADD8E6,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.5
-    classDef orange fill:#FBAA60,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.5
-    classDef orchid fill:#C26DBC,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.5
-    classDef invisible fill:transparent,stroke:transparent;
-    style FaqGen-MegaService stroke:#000000
-    %% Subgraphs %%
-    subgraph FaqGen-MegaService["FaqGen MegaService "]
-        direction LR
-        LLM([LLM MicroService]):::blue
-    end
-    subgraph UserInterface[" User Interface "]
-        direction LR
-        a([User Input Query]):::orchid
-        UI([UI server<br>]):::orchid
-    end
-    LLM_gen{{LLM Service <br>}}
-    GW([FaqGen GateWay<br>]):::orange
-    %% Questions interaction
-    direction LR
-    a[User Input Query] --> UI
-    UI --> GW
-    GW <==> FaqGen-MegaService
-    %% Embedding service flow
-    direction LR
-    LLM <-.-> LLM_gen
-```
-
---
-
-## Table of Contents
-
-1. [Build Docker Images](#build-docker-images)
-2. [Validate Microservices](#validate-microservices)
-3. [Launch the UI](#launch-the-ui)
-4. [Launch the Conversational UI (Optional)](#launch-the-conversational-ui-optional)
-
---
-
-## Build Docker Images
-
-First of all, you need to build Docker Images locally. This step can be ignored once the Docker images are published to Docker hub.
-
-### 1. Build vLLM Image
-
-```bash
-git clone https://github.com/vllm-project/vllm.git
-cd ./vllm/
-VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
-git checkout ${VLLM_VER}
-docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
-```
-
-### 2. Build LLM Image
-
-```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
-```
-
-### 3. Build MegaService Docker Image
-
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `faqgen.py` Python script. Build the MegaService Docker image via below command:
-
-```bash
-git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/ChatQnA
-docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
-```
-
-### 4. Build UI Docker Image
-
-Build frontend Docker image via below command:
-
-```bash
-cd GenAIExamples/ChatQnA/ui
-docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
-```
-
-### 5. Build Conversational React UI Docker Image (Optional)
-
-Build frontend Docker image that enables Conversational experience with ChatQnA megaservice via below command:
-
-**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
-
-```bash
-cd GenAIExamples/ChatQnA/ui
-docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
-```
-
-### 6. Build Nginx Docker Image
-
-```bash
-cd GenAIComps
-docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/nginx/src/Dockerfile .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/vllm:latest`
-2. `opea/llm-faqgen:latest`
-3. `opea/chatqna:latest`
-4. `opea/chatqna-ui:latest`
-5. `opea/nginx:latest`
-
-## Start Microservices and MegaService
-
-### Required Models
-
-We set default model as "meta-llama/Meta-Llama-3-8B-Instruct", change "LLM_MODEL_ID" in following Environment Variables setting if you want to use other models.
-
-If use gated models, you also need to provide [huggingface token](https://huggingface.co/docs/hub/security-tokens) to "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
-
-```bash
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
-export host_ip=${your_host_ip}
-export LLM_ENDPOINT_PORT=8008
-export LLM_SERVICE_PORT=9000
-export FAQGEN_BACKEND_PORT=8888
-export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
-export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
-export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/faqgen"
-```
-
-Note: Please replace with `your_host_ip` with your external IP address, do not use localhost.
-
-### Start Microservice Docker Containers
-
-```bash
-cd GenAIExamples/FaqGen/docker_compose/intel/cpu/xeon
-docker compose up -d
-```
-
-### Validate Microservices
-
-1. vLLM Service
-
-```bash
-curl http://${host_ip}:${LLM_ENDPOINT_PORT}/v1/chat/completions \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
-```
-
-2. LLM Microservice
-
-```bash
-curl http://${host_ip}:${LLM_SERVICE_PORT}/v1/faqgen \
-  -X POST \
-  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-  -H 'Content-Type: application/json'
-```
-
-3. MegaService
-
-```bash
-curl http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen \
-  -H "Content-Type: multipart/form-data" \
-  -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-  -F "max_tokens=32" \
-  -F "stream=False"
-```
-
-```bash
-## enable stream
-curl http://${host_ip}:${FAQGEN_BACKEND_PORT}/v1/faqgen \
-  -H "Content-Type: multipart/form-data" \
-  -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-  -F "max_tokens=32" \
-  -F "stream=True"
-```
-
-Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service.
-
-## Launch the UI
-
-To access the frontend, open the following URL in your browser: http://{host_ip}:5173. By default, the UI runs on port 5173 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:
-
-```yaml
-  chaqna-gaudi-ui-server:
-    image: opea/chatqna-ui:latest
-    ...
-    ports:
-      - "80:5173"
-```
-
-## Launch the Conversational UI (Optional)
-
-To access the Conversational UI frontend, open the following URL in your browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:
-
-```yaml
-  chaqna-xeon-conversation-ui-server:
-    image: opea/chatqna-conversation-ui:latest
-    ...
-    ports:
-      - "80:80"
-```
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
@@ -1,22 +1,18 @@
-# Deploying ChatQnA with Pinecone on Intel® Xeon® Processors
+# Build Mega Service of ChatQnA on Xeon

-This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel® Xeon® servers. The pipeline integrates **Pinecone** as the vector database (VectorDB) and includes microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
+This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.

---
+The default pipeline deploys with vLLM as the LLM serving component and leverages the rerank component.

-## Table of Contents
+Quick Start:

-1. [Quick Start](#quick-start)
-2. [Build Docker Images](#build-docker-images)
-3. [Validate Microservices](#validate-microservices)
-4. [Launch the UI](#launch-the-ui)
-5. [Launch the Conversational UI (Optional)](#launch-the-conversational-ui-optional)
+1. Set up the environment variables.
+2. Run Docker Compose.
+3. Consume the ChatQnA Service.

---
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).

-## Quick Start
-
-### 1.Set up Environment Variable
+## Quick Start: 1.Setup Environment Variable

 To set up environment variables for deploying ChatQnA services, follow these steps:

@@ -35,8 +31,8 @@ To set up environment variables for deploying ChatQnA services, follow these ste

   ```bash
   export http_proxy="Your_HTTP_Proxy"
-   export https_proxy="Your_HTTPS_Proxy"
-   # Example: no_proxy="localhost,127.0.0.1,192.168.1.1"
+   export https_proxy="Your_HTTPs_Proxy"
+   # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
   export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-pinecone-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
   ```

@@ -45,28 +41,28 @@ To set up environment variables for deploying ChatQnA services, follow these ste
   source ./set_env.sh
   ```

-### 2.Run Docker Compose
+## Quick Start: 2.Run Docker Compose

 ```bash
 docker compose -f compose_pinecone.yaml up -d
 ```

-It will automatically download the Docker image on `Docker hub`:
+It will automatically download the Docker image from `Docker Hub`:

 ```bash
 docker pull opea/chatqna:latest
 docker pull opea/chatqna-ui:latest
 ```

-Note: You should build docker image from source by yourself if:
+NB: You should build docker image from source by yourself if:

 - You are developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
 - You can't download the docker image.
 - You want to use a specific version of Docker image.

-Please refer to ['Build Docker Images'](#build-docker-images) in below.
+Please refer to ['Build Docker Images'](#🚀-build-docker-images) below.

-### 3.Consume the ChatQnA Service
+## Quick Start: 3.Consume the ChatQnA Service

 ```bash
 curl http://${host_ip}:8888/v1/chatqna \
@@ -76,7 +72,35 @@ curl http://${host_ip}:8888/v1/chatqna \
    }'
 ```

-## Build Docker Images
+## 🚀 Apply Xeon Server on AWS
+
+To apply a Xeon server on AWS, start by creating an AWS account if you don't have one already. Then, head to the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home) to begin the process. Within the EC2 service, select the Amazon EC2 M7i or M7i-flex instance type to leverage 4th Generation Intel Xeon Scalable processors that are optimized for demanding workloads.
+
+For detailed information about these instance types, you can refer to this [link](https://aws.amazon.com/ec2/instance-types/m7i/). Once you've chosen the appropriate instance type, proceed with configuring your instance settings, including network configurations, security groups, and storage options.
+
+After launching your instance, you can connect to it using SSH (for Linux instances) or Remote Desktop Protocol (RDP) (for Windows instances). From there, you'll have full access to your Xeon server, allowing you to install, configure, and manage your applications as needed.
+
+### Network Port & Security
+
+- Access the ChatQnA UI by web browser
+
+  The UI is accessible on port `80`. Please confirm that port `80` is open in the firewall of the EC2 instance.
+
+- Access the microservice by tool or API
+
+  1. Login to the EC2 instance and access by **local IP address** and port.
+
+     This is the recommended approach and requires no changes to the network port settings.
+
+  2. Login to a remote client and access by **public IP address** and port.
+
+     You need to open the microservice's port in the security group settings of the EC2 instance.
+
+     For detailed guide, please refer to [Validate Microservices](#validate-microservices).
+
+     Note: this increases the security risk, so please confirm before doing it.
+
+## 🚀 Build Docker Images

 First of all, you need to build Docker Images locally and install the python package of it.

@@ -194,7 +218,7 @@ For users in China who are unable to download models directly from Huggingface,
     docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
     ```

-### Set up Environment Variables
+### Setup Environment Variables

 1. Set the required environment variables:

@@ -239,7 +263,7 @@ If use vLLM backend.
 docker compose -f compose_pinecone.yaml up -d
 ```

-## Validate Microservices
+### Validate Microservices

 Note, when verify the microservices by curl or API from remote client, please make sure the **ports** of the microservices are opened in the firewall of the cloud node.
 Follow the instructions to validate MicroServices.
@@ -359,12 +383,12 @@ To delete the files/link you uploaded:

 ```bash
 # delete all uploaded files and links
-curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+curl -X POST "http://${host_ip}:6009/v1/dataprep/delete" \
     -d '{"file_path": "all"}' \
     -H "Content-Type: application/json"
 ```

-## Launch the UI
+## 🚀 Launch the UI

 ### Launch with origin port

@@ -382,7 +406,7 @@ To access the frontend, open the following URL in your browser: http://{host_ip}

 If you want to launch the UI using Nginx, open this URL: `http://${host_ip}:${NGINX_PORT}` in your browser to access the frontend.

-## Launch the Conversational UI (Optional)
+## 🚀 Launch the Conversational UI (Optional)

 To access the Conversational UI (react based) frontend, modify the UI service in the `compose.yaml` file. Replace `chaqna-xeon-ui-server` service with the `chatqna-xeon-conversation-ui-server` service as per the config below:

--- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
@@ -1,19 +1,71 @@
-# Deploying ChatQnA with Qdrant on Intel® Xeon® Processors
+# Build Mega Service of ChatQnA (with Qdrant) on Xeon

-This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel® Xeon® servers. The pipeline integrates **Qdrant** as the vector database (VectorDB) and includes microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
+This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.

---
+The default pipeline deploys with vLLM as the LLM serving component and leverages the rerank component.

-## Table of Contents
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).

-1. [Build Docker Images](#build-docker-images)
-2. [Validate Microservices](#validate-microservices)
-3. [Launch the UI](#launch-the-ui)
-4. [Launch the Conversational UI (Optional)](#launch-the-conversational-ui-optional)
+## 🚀 Apply Xeon Server on AWS

---
+To apply a Xeon server on AWS, start by creating an AWS account if you don't have one already. Then, head to the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home) to begin the process. Within the EC2 service, select the Amazon EC2 M7i or M7i-flex instance type to leverage the power of 4th Generation Intel Xeon Scalable processors. These instances are optimized for high-performance computing and demanding workloads.

-## Build Docker Images
+For detailed information about these instance types, you can refer to this [link](https://aws.amazon.com/ec2/instance-types/m7i/). Once you've chosen the appropriate instance type, proceed with configuring your instance settings, including network configurations, security groups, and storage options.
+
+After launching your instance, you can connect to it using SSH (for Linux instances) or Remote Desktop Protocol (RDP) (for Windows instances). From there, you'll have full access to your Xeon server, allowing you to install, configure, and manage your applications as needed.
+
+**Certain ports in the EC2 instance need to be opened up in the security group for the microservices to work with the curl commands**
+
+> See one example below. Please open up these ports in the EC2 instance based on the IP addresses you want to allow
+
+```
+qdrant-vector-db
+===============
+Port 6333 - Open to 0.0.0.0/0
+Port 6334 - Open to 0.0.0.0/0
+
+dataprep-qdrant-server
+======================
+Port 6043 - Open to 0.0.0.0/0
+
+tei_embedding_service
+=====================
+Port 6040 - Open to 0.0.0.0/0
+
+embedding
+=========
+Port 6044 - Open to 0.0.0.0/0
+
+retriever
+=========
+Port 6045 - Open to 0.0.0.0/0
+
+tei_reranking_service
+================
+Port 6041 - Open to 0.0.0.0/0
+
+reranking
+=========
+Port 6046 - Open to 0.0.0.0/0
+
+vllm-service
+===========
+Port 6042 - Open to 0.0.0.0/0
+
+llm
+===
+Port 6047 - Open to 0.0.0.0/0
+
+chaqna-xeon-backend-server
+==========================
+Port 8912 - Open to 0.0.0.0/0
+
+chaqna-xeon-ui-server
+=====================
+Port 5173 - Open to 0.0.0.0/0
+```
+
+## 🚀 Build Docker Images

 First of all, you need to build Docker Images locally and install the python package of it.

@@ -85,7 +137,7 @@ Then run the command `docker images`, you will have the following 5 Docker Image
 4. `opea/chatqna-ui:latest`
 5. `opea/nginx:latest`

-## Start Microservices
+## 🚀 Start Microservices

 ### Required Models

@@ -240,7 +292,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
        -F 'link_list=["https://opea.dev"]'
   ```

-## Launch the UI
+## 🚀 Launch the UI

 To access the frontend, open the following URL in your browser: http://{host_ip}:5173. By default, the UI runs on port 5173 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:

@@ -252,7 +304,7 @@ To access the frontend, open the following URL in your browser: http://{host_ip}
      - "80:5173"
 ```

-## Launch the Conversational UI (Optional)
+## 🚀 Launch the Conversational UI (react)

 To access the Conversational UI frontend, open the following URL in your browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:

--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -32,12 +32,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -102,7 +96,6 @@ services:
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
-      VLLM_CPU_KVCACHE_SPACE: 40
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
      interval: 10s
@@ -113,18 +106,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
-      tei-embedding-service:
-        condition: service_started
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_healthy
+      - redis-vector-db
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
    ports:
      - "8888:8888"
    environment:
@@ -138,7 +124,7 @@ services:
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
      - LLM_SERVER_HOST_IP=vllm-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
    ipc: host
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -127,20 +121,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      tei-embedding-service:
-        condition: service_started
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_started
-      llm-faqgen:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
+      - redis-vector-db
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
+      - llm-faqgen
    ports:
      - ${CHATQNA_BACKEND_PORT:-8888}:8888
    environment:
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -127,20 +121,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      tei-embedding-service:
-        condition: service_started
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      tgi-service:
-        condition: service_started
-      llm-faqgen:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
+      - redis-vector-db
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
+      - tgi-service
+      - llm-faqgen
    ports:
      - ${CHATQNA_BACKEND_PORT:-8888}:8888
    environment:
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_milvus.yaml
@@ -78,11 +78,6 @@ services:
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
      LOGFLAG: ${LOGFLAG}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
    restart: unless-stopped
    depends_on:
      milvus-standalone:
@@ -159,7 +154,7 @@ services:
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
@@ -169,16 +164,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      tei-embedding-service:
-        condition: service_started
-      dataprep-milvus-service:
-        condition: service_healthy
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_healthy
+      - milvus-standalone
+      - tei-embedding-service
+      - dataprep-milvus-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
    ports:
      - "8888:8888"
    environment:
@@ -192,7 +183,7 @@ services:
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
      - LLM_SERVER_HOST_IP=vllm-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
    ipc: host
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml
@@ -22,12 +22,6 @@ services:
      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_PINECONE"
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -95,16 +89,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      tei-embedding-service:
-        condition: service_started
-      dataprep-pinecone-service:
-        condition: service_healthy
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_started
+      - tei-embedding-service
+      - dataprep-pinecone-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
    ports:
      - "8888:8888"
    environment:
@@ -118,7 +107,7 @@ services:
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
      - LLM_SERVER_HOST_IP=vllm-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LOGFLAG=${LOGFLAG}
      - LLM_MODEL=${LLM_MODEL_ID}
    ipc: host
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml
@@ -26,12 +26,6 @@ services:
      TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_QDRANT"
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
@@ -100,18 +94,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      qdrant-vector-db:
-        condition: service_started
-      dataprep-qdrant-service:
-        condition: service_healthy
-      tei-embedding-service:
-        condition: service_started
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_started
+      - qdrant-vector-db
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
    ports:
      - "8912:8888"
    environment:
@@ -126,7 +113,7 @@ services:
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
      - LLM_SERVER_HOST_IP=vllm-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
    ipc: host
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_remote.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_remote.yaml
@@ -1,148 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-services:
-  redis-vector-db:
-    image: redis/redis-stack:7.2.0-v9
-    container_name: redis-vector-db
-    ports:
-      - "6379:6379"
-      - "8001:8001"
-    healthcheck:
-      test: ["CMD", "redis-cli", "ping"]
-      interval: 5s
-      timeout: 3s
-      retries: 10
-  dataprep-redis-service:
-    image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
-    container_name: dataprep-redis-server
-    depends_on:
-      redis-vector-db:
-        condition: service_healthy
-      tei-embedding-service:
-        condition: service_started
-    ports:
-      - "6007:5000"
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      REDIS_URL: redis://redis-vector-db:6379
-      REDIS_HOST: redis-vector-db
-      INDEX_NAME: ${INDEX_NAME}
-      TEI_ENDPOINT: http://tei-embedding-service:80
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-  tei-embedding-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
-    container_name: tei-embedding-server
-    ports:
-      - "6006:80"
-    volumes:
-      - "${MODEL_CACHE:-./data}:/data"
-    shm_size: 1g
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
-  retriever:
-    image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
-    container_name: retriever-redis-server
-    depends_on:
-      - redis-vector-db
-    ports:
-      - "7000:7000"
-    ipc: host
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      REDIS_URL: redis://redis-vector-db:6379
-      REDIS_HOST: redis-vector-db
-      INDEX_NAME: ${INDEX_NAME}
-      TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      LOGFLAG: ${LOGFLAG}
-      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
-    restart: unless-stopped
-  tei-reranking-service:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
-    container_name: tei-reranking-server
-    ports:
-      - "8808:80"
-    volumes:
-      - "${MODEL_CACHE:-./data}:/data"
-    shm_size: 1g
-    environment:
-      no_proxy: ${no_proxy}
-      http_proxy: ${http_proxy}
-      https_proxy: ${https_proxy}
-      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-      HF_HUB_DISABLE_PROGRESS_BARS: 1
-      HF_HUB_ENABLE_HF_TRANSFER: 0
-    command: --model-id ${RERANK_MODEL_ID} --auto-truncate
-  chatqna-xeon-backend-server:
-    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
-    container_name: chatqna-xeon-backend-server
-    depends_on:
-      - redis-vector-db
-      - tei-embedding-service
-      - retriever
-      - tei-reranking-service
-    ports:
-      - "8888:8888"
-    environment:
-      - no_proxy=${no_proxy}
-      - https_proxy=${https_proxy}
-      - http_proxy=${http_proxy}
-      - MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server
-      - EMBEDDING_SERVER_HOST_IP=tei-embedding-service
-      - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80}
-      - RETRIEVER_SERVICE_HOST_IP=retriever
-      - RERANK_SERVER_HOST_IP=tei-reranking-service
-      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
-      - LLM_SERVER_HOST_IP=${REMOTE_ENDPOINT}
-      - OPENAI_API_KEY= ${OPENAI_API_KEY}
-      - LLM_SERVER_PORT=80
-      - LLM_MODEL=${LLM_MODEL_ID}
-      - LOGFLAG=${LOGFLAG}
-    ipc: host
-    restart: always
-  chatqna-xeon-ui-server:
-    image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
-    container_name: chatqna-xeon-ui-server
-    depends_on:
-      - chatqna-xeon-backend-server
-    ports:
-      - "5173:5173"
-    environment:
-      - no_proxy=${no_proxy}
-      - https_proxy=${https_proxy}
-      - http_proxy=${http_proxy}
-    ipc: host
-    restart: always
-  chatqna-xeon-nginx-server:
-    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
-    container_name: chatqna-xeon-nginx-server
-    depends_on:
-      - chatqna-xeon-backend-server
-      - chatqna-xeon-ui-server
-    ports:
-      - "${NGINX_PORT:-80}:80"
-    environment:
-      - no_proxy=${no_proxy}
-      - https_proxy=${https_proxy}
-      - http_proxy=${http_proxy}
-      - FRONTEND_SERVICE_IP=chatqna-xeon-ui-server
-      - FRONTEND_SERVICE_PORT=5173
-      - BACKEND_SERVICE_NAME=chatqna
-      - BACKEND_SERVICE_IP=chatqna-xeon-backend-server
-      - BACKEND_SERVICE_PORT=8888
-      - DATAPREP_SERVICE_IP=dataprep-redis-service
-      - DATAPREP_SERVICE_PORT=5000
-    ipc: host
-    restart: always
-
-networks:
-  default:
-    driver: bridge
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -100,18 +94,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      tei-embedding-service:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      tgi-service:
-        condition: service_started
+      - redis-vector-db
+      - tei-embedding-service
+      - dataprep-redis-service
+      - retriever
+      - tei-reranking-service
+      - tgi-service
    ports:
      - "8888:8888"
    environment:
@@ -125,7 +113,7 @@ services:
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
      - LLM_SERVER_HOST_IP=tgi-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
    ipc: host
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml
@@ -25,12 +25,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-server
@@ -84,16 +78,11 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      tei-embedding-service:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
-      retriever:
-        condition: service_started
-      vllm-service:
-        condition: service_started
+      - redis-vector-db
+      - tei-embedding-service
+      - dataprep-redis-service
+      - retriever
+      - vllm-service
    ports:
      - "8888:8888"
    environment:
@@ -105,7 +94,7 @@ services:
      - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80}
      - RETRIEVER_SERVICE_HOST_IP=retriever
      - LLM_SERVER_HOST_IP=vllm-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
      - CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_NO_RERANK}
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -176,7 +176,7 @@ This deployment may allocate more Gaudi resources to the tgi-service to optimize

 ### compose_faqgen.yaml - FAQ generation Deployment

-The FAQs (frequently asked questions and answers) generation Deployment will generate FAQs instead of normally text generation. It adds a new microservice called `llm-faqgen`, which is a microservice that interacts with the TGI/vLLM LLM server to generate FAQs from input text.
+The FAQs (frequently asked questions and answers) generation Deployment will generate FAQs instead of normally text generation. It adds a new microservice called `llm-faqgen`, which is a microservice that interacts with the TGI/vLLM LLM server to generate FAQs from input text.

 The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.

@@ -214,13 +214,13 @@ This setup might allow for more Gaudi devices to be dedicated to the `vllm-servi

 ### compose_guardrails.yaml - Guardrails Deployment

-The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `vllm-guardrails-service` and `guardrails` services. The `vllm-guardrails-service` uses the `opea/vllm-gaudi:latest` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `vllm-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `vllm-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.
+The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.3.1` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). This backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.

 | Service Name                 | Image Name                                            | Gaudi Specific | Uses LLM |
 | ---------------------------- | ----------------------------------------------------- | -------------- | -------- |
 | redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No             | No       |
 | dataprep-redis-service       | opea/dataprep:latest                                  | No             | No       |
-| _vllm-guardrails-service_    | opea/vllm-gaudi:latest                                | 1 card         | Yes      |
+| _tgi-guardrails-service_     | ghcr.io/huggingface/tgi-gaudi:2.3.1                   | 1 card         | Yes      |
 | _guardrails_                 | opea/guardrails:latest                                | No             | No       |
 | tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.6 | No             | No       |
 | retriever                    | opea/retriever:latest                                 | No             | No       |
@@ -230,7 +230,7 @@ The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over t
 | chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No             | No       |
 | chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No             | No       |

-The deployment with guardrails introduces additional Gaudi-specific services, such as the `vllm-guardrails-service`, which necessitates careful consideration of Gaudi allocation. This deployment aims to balance safety and performance, potentially requiring a strategic distribution of Gaudi devices between the guardrail services and the LLM tasks to maintain both operational safety and efficiency.
+The deployment with guardrails introduces additional Gaudi-specific services, such as the `tgi-guardrails-service`, which necessitates careful consideration of Gaudi allocation. This deployment aims to balance safety and performance, potentially requiring a strategic distribution of Gaudi devices between the guardrail services and the LLM tasks to maintain both operational safety and efficiency.

 ### Telemetry Enablement - compose.telemetry.yaml and compose_tgi.telemetry.yaml

@@ -284,19 +284,15 @@ ChatQnA now supports running the latest DeepSeek models, including [deepseek-ai/

 ### tei-embedding-service & tei-reranking-service

-The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.6` image supporting `tei-embedding-service` and `tei-reranking-service` depends on the `EMBEDDING_MODEL_ID` or `RERANK_MODEL_ID` environment variables respectively to specify the embedding model and reranking model used for converting text into vector representations and rankings. This choice impacts the quality and relevance of the embeddings rerankings for various applications. Unlike the `vllm-service`, the `tei-embedding-service` and `tei-reranking-service` each typically acquires only one Gaudi device and does not use the `NUM_CARDS` parameter; embedding and reranking tasks generally do not require extensive parallel processing and one Gaudi per service is appropriate. The list of [supported embedding and reranking models](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) can be found at the [huggingface/tei-gaudi](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) website.
+The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.6` image supporting `tei-embedding-service` and `tei-reranking-service` depends on the `EMBEDDING_MODEL_ID` or `RERANK_MODEL_ID` environment variables respectively to specify the embedding model and reranking model used for converting text into vector representations and rankings. This choice impacts the quality and relevance of the embeddings rerankings for various applications. Unlike the `vllm-service`, the `tei-embedding-service` and `tei-reranking-service` each typically acquires only one Gaudi device and does not use the `NUM_CARDS` parameter; embedding and reranking tasks generally do not require extensive parallel processing and one Gaudi per service is appropriate. The list of [supported embedding and reranking models](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) can be found at the [huggingface/tei-gaudi](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) website.

-### tgi-guardrails-service
+### tgi-guardrails-service

 The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.3.1` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.

-### vllm-guardrails-service
-
-The `vllm-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://docs.vllm.ai/en/latest/models/supported_models.html) for the associated `opea/vllm-gaudi:latest` image. It uses the `NUM_CARDS` parameter.
-
 ## Conclusion

-In examining the various services and configurations across different deployments, developers should gain a comprehensive understanding of how each component contributes to the overall functionality and performance of a ChatQnA pipeline on an Intel® Gaudi® platform. Key services such as the `vllm-service`, `tei-embedding-service`, `tei-reranking-service`, `tgi-guardrails-service`and `vllm-guardrails-service` each consume Gaudi accelerators, leveraging specific models and hardware resources to optimize their respective tasks. The `LLM_MODEL_ID`, `EMBEDDING_MODEL_ID`, `RERANK_MODEL_ID`, and `GUARDRAILS_MODEL_ID` parameters specify the models used, directly impacting the quality and effectiveness of language processing, embedding, reranking, and safety operations.
+In examining the various services and configurations across different deployments, developers should gain a comprehensive understanding of how each component contributes to the overall functionality and performance of a ChatQnA pipeline on an Intel® Gaudi® platform. Key services such as the `vllm-service`, `tei-embedding-service`, `tei-reranking-service`, and `tgi-guardrails-service` each consume Gaudi accelerators, leveraging specific models and hardware resources to optimize their respective tasks. The `LLM_MODEL_ID`, `EMBEDDING_MODEL_ID`, `RERANK_MODEL_ID`, and `GUARDRAILS_MODEL_ID` parameters specify the models used, directly impacting the quality and effectiveness of language processing, embedding, reranking, and safety operations.

 The allocation of Gaudi devices, affected by the Gaudi dependent services and the `NUM_CARDS` parameter supporting the `vllm-service` or `tgi-service`, determines where computational power is utilized to enhance performance.

--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -32,12 +32,6 @@ services:
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-gaudi-server
@@ -108,7 +102,7 @@ services:
      NUM_CARDS: ${NUM_CARDS}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
@@ -116,23 +110,16 @@ services:
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
  chatqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-gaudi-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      tei-embedding-service:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_healthy
+      - redis-vector-db
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
    ports:
      - "8888:8888"
    environment:
@@ -146,7 +133,7 @@ services:
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
      - LLM_SERVER_HOST_IP=vllm-service
-      - LLM_SERVER_PORT=80
+      - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
    ipc: host
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen.yaml
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen.yaml
@@ -26,12 +26,6 @@ services:
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LOGFLAG: ${LOGFLAG}
-    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
-      interval: 10s
-      timeout: 5s
-      retries: 50
-    restart: unless-stopped
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
    container_name: tei-embedding-gaudi-server
@@ -106,7 +100,7 @@ services:
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
      VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
    healthcheck:
-      test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
+      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
@@ -114,7 +108,7 @@ services:
    cap_add:
      - SYS_NICE
    ipc: host
-    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
  llm-faqgen:
    image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
    container_name: llm-faqgen-server
@@ -138,20 +132,12 @@ services:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-gaudi-backend-server
    depends_on:
-      redis-vector-db:
-        condition: service_started
-      tei-embedding-service:
-        condition: service_started
-      dataprep-redis-service:
-        condition: service_healthy
-      retriever:
-        condition: service_started
-      tei-reranking-service:
-        condition: service_started
-      vllm-service:
-        condition: service_healthy
-      llm-faqgen:
-        condition: service_started
+      - redis-vector-db
+      - tei-embedding-service
+      - retriever
+      - tei-reranking-service
+      - vllm-service
+      - llm-faqgen
    ports:
      - ${CHATQNA_BACKEND_PORT:-8888}:8888
    environment:
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
lkk	9d1e01bf61	Merge branch 'main' into replace_agent_ui	2025-04-07 10:05:35 +08:00
lkk12014402	36461d7303	update ui style.	2025-04-04 07:41:56 +00:00
lkk12014402	c7bec31873	patch openwebui for opea agent.	2025-04-03 15:02:06 +00:00