Compare commits


110 Commits

Author SHA1 Message Date
ZePan110
e1f98c896b increase timeout 10s to 60s
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-04-03 22:23:42 +08:00
ZePan110
e23eb6013f only test
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-04-03 00:59:49 +08:00
xiguiw
87baeb833d Update TEI docker image to 1.6 (#1650)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-03-27 09:40:22 +08:00
Shifani Rajabose
03179296b4 [Bug: 899] Create a version of DocIndexRetriever example with Zilliz/Milvus as Vector DB (#1616)
Signed-off-by: Shifani Rajabose <srajabose@habana.ai>
Co-authored-by: pallavi jaini <pallavi.jaini@intel.com>
2025-03-26 15:19:38 +08:00
Louie Tsai
139f2aeeeb typo for docker image (#1717)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-03-25 08:34:59 -07:00
Pranav Singh
61a8befe05 [docs] Multimodal Endpoints Issue (#1700)
Signed-off-by: Pranav Singh <pranav.singh@intel.com>
Co-authored-by: Ying Hu <ying.hu@intel.com>
2025-03-25 14:35:12 +08:00
XinyaoWa
4582e53b8a Remove FaqGen from ProductivitySuite (#1709)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
2025-03-24 17:42:02 +08:00
lkk
566ffb2edc remove 3 useless environments. (#1708) 2025-03-24 15:34:45 +08:00
chyundunovDatamonsters
a04463d5e3 Adding files to deploy CodeTrans application on ROCm vLLM (#1545)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
2025-03-24 15:33:35 +08:00
chyundunovDatamonsters
31b1d69e40 Adding files to deploy CodeGen application on ROCm vLLM (#1544)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
2025-03-24 14:45:17 +08:00
ZePan110
fe2a6674e0 Fix CD cancel issue (#1706)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-24 13:58:07 +08:00
chyundunovDatamonsters
60591d8d56 Adding files to deploy AudioQnA application on ROCm vLLM (#1655)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
Signed-off-by: Artem Astafev <a.astafev@datamonsters.com>
Co-authored-by: Chingis Yundunov <YundunovCN@sibedge.com>
Co-authored-by: Artem Astafev <a.astafev@datamonsters.com>
2025-03-24 10:03:37 +08:00
chen, suyue
7636de02e4 Enhance port release before CI test (#1704)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-24 09:24:43 +08:00
Eero Tamminen
d397e3f631 Use GenAIComp base image to simplify Dockerfiles - part 3/4 (#1671)
Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
2025-03-24 09:17:12 +08:00
Louie Tsai
0736912c69 change gaudi node exporter from default one to 41612 (#1702)
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-03-20 21:38:24 -07:00
Louie Tsai
e8f2313e07 Integrate docker images into compose yaml file to simplify the run instructions. fix ui ip issue and add web search tool support (#1656)
Integrate docker images into compose yaml file to simplify the run instructions. fix ui ip issue and add web search tool support

Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: alexsin368 <alex.sin@intel.com>
2025-03-21 09:42:20 +08:00
XinyaoWa
6d24c1c77a Merge FaqGen into ChatQnA (#1654)
1. Delete FaqGen
2. Refactor FaqGen into ChatQnA, serve as a LLM selection.
3. Combine all ChatQnA related Dockerfile into one

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
2025-03-20 17:40:00 +08:00
Zhu Yongbo
5a50ae0471 Add new UI/new features for EC-RAG (#1665)
Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com>
2025-03-20 10:46:01 +08:00
minmin-intel
fecc22719a fix errors for running AgentQnA on xeon with openai and update readme (#1664)
Signed-off-by: minmin-intel <minmin.hou@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-20 09:57:18 +08:00
chen, suyue
2204fe8e36 Enable base image build in CI/CD (#1669)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-19 09:21:51 +08:00
ZePan110
b50dd8f47a Fix workflow issues. (#1691)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-19 09:21:27 +08:00
Spycsh
bf8d03425c Set vLLM as default model for VisualQnA (#1644) 2025-03-18 15:29:49 +08:00
chen, suyue
1b6342aa5b Fix input issue for manual-image-build.yml (#1666)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-17 13:11:53 +08:00
James Edwards
527b146a80 Add final README.md and set_env.sh script for quickstart review. Previous pull request was 1595. (#1662)
Signed-off-by: Edwards, James A <jaedwards@habana.ai>
Co-authored-by: Edwards, James A <jaedwards@habana.ai>
2025-03-14 16:05:01 -07:00
Sun, Xuehao
7159ce3731 Update stale issue and PR settings to 30 days for inactivity (#1661)
Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
2025-03-14 17:55:49 +08:00
Louie Tsai
671dff7f51 [ChatQnA] Enable Prometheus and Grafana with telemetry docker compose file. (#1623)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-03-13 23:18:29 -07:00
Wang, Kai Lawrence
8fe19291c8 [AudioQnA] Enable vLLM and set it as default LLM serving (#1657)
Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-14 09:56:33 +08:00
CharleneHu-42
35c5cf5de8 Refine README with highlighted examples and updated support info (#1006)
Signed-off-by: CharleneHu-42 <yabai.hu@intel.com>
Co-authored-by: Yi Yao <yi.a.yao@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ying Hu <ying.hu@intel.com>
2025-03-13 13:50:28 +08:00
ZePan110
63b789ae91 Enable Gaudi3, Rocm and Arc on manually release test. (#1615)
1. Enable Gaudi3, Rocm and Arc on manually release test.
2. Fix the issue that manual workflow can't be canceled.

Signed-off-by: ZePan110 <ze.pan@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-13 13:38:53 +08:00
ZePan110
d670dbf0aa Enable GraphRAG and ProductivitySuite model cache for docker compose test. (#1608)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-13 11:23:03 +08:00
Li Gang
0701b8cfff [ChatQnA][docker]Check healthy of redis to avoid dataprep failure (#1591)
Signed-off-by: Li Gang <gang.g.li@intel.com>
2025-03-13 10:52:33 +08:00
xiguiw
effa2a28cf Enable CodeGen vLLM (#1636)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-13 10:38:47 +08:00
ZePan110
adcd113f53 Enable inject_commit to docker image feature. (#1653)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-13 09:39:42 +08:00
Eero Tamminen
4269669f73 Use GenAIComp base image to simplify Dockerfiles & reduce image sizes - part 2 (#1638)
Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
2025-03-13 08:23:07 +08:00
Sun, Xuehao
12657ac945 Add GitHub Action to check and close stale issues and PRs (#1646)
Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
2025-03-12 10:56:07 +08:00
chen, suyue
43d0a18270 Enhance ChatQnA test scripts (#1643)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-10 17:36:26 +08:00
Wang, Kai Lawrence
5362321d3a Fix vllm model cache directory (#1642)
Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
2025-03-10 13:40:42 +08:00
XinyaoWa
eb245fd085 Set vLLM as default model for FaqGen (#1580)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
2025-03-10 09:39:35 +08:00
chen, suyue
4cab86260f Use the latest HabanaAI/vllm-fork release tag to build vllm-gaudi image (#1635)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
2025-03-07 20:40:32 +08:00
wangleflex
694207f76b [ChatQnA] Show spinner after query to improve user experience (#1003) (#1628)
Signed-off-by: Wang,Le3 <le3.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-07 17:08:53 +08:00
chen, suyue
555e2405b9 Fix corner CI issue when the example path deleted (#1634)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-07 15:05:08 +08:00
Shifani Rajabose
7a92435269 [Bug: 112] Fix introduction in GenAIExamples main README (#1631) 2025-03-07 14:31:34 +08:00
Eero Tamminen
c9085c3c68 Use GenAIComp base image to simplify Dockerfiles (#1612)
Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
2025-03-07 13:13:29 +08:00
ZePan110
36aaed748b Update model cache for AgentQnA (#1627)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 11:00:48 +08:00
Letong Han
9180f1066d Enable vllm for CodeTrans (#1626)
Set vllm as default llm serving, and add related docker compose files, readmes, and test scripts.

Issue: https://github.com/opea-project/GenAIExamples/issues/1436

Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-07 10:56:21 +08:00
ZePan110
5aecea8e47 Update compose.yaml (#1619)
Update compose.yaml for CodeGen, CodeTrans and DocSum

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 09:20:28 +08:00
ZePan110
6723395e31 Update compose.yaml (#1620)
Update compose.yaml for AudioQnA, DBQnA, DocIndexRetriever, FaqGen, Translation and VisualQnA.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 09:20:08 +08:00
ZePan110
785ffb9a1e Update compose.yaml for ChatQnA (#1621)
Update compose.yaml for ChatQnA

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 09:19:39 +08:00
ZePan110
428ba481b2 Update compose.yaml for SearchQnA (#1622)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 08:38:59 +08:00
Wang, Kai Lawrence
2dfcfa0436 [AudioQnA] Fix the LLM model field for inputs alignment (#1611)
Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
2025-03-05 22:15:07 +08:00
Zhu Yongbo
8a5ad1fc72 Fix docker image opea/edgecraftrag security issue #1577 (#1617)
Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com>
2025-03-05 22:13:53 +08:00
ZePan110
24cacaaa48 Enable SearchQnA model cache for docker compose test. (#1606)
Enable SearchQnA model cache for docker compose test.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-05 17:13:24 +08:00
ZePan110
6ead1b12db Enable ChatQnA model cache for docker compose test. (#1605)
Enable ChatQnA model cache for docker compose test.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-05 11:30:04 +08:00
rbrugaro
8dac9d1035 bugfix GraphRAG updated docker compose and env settings to fix issues post refactor (#1567)
Signed-off-by: rbrugaro <rita.brugarolas.brufau@intel.com>
Signed-off-by: Rita Brugarolas Brufau <rita.brugarolas.brufau@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
2025-03-04 09:44:13 -08:00
ZePan110
c1b5ba281f Enable CodeGen,CodeTrans and DocSum model cache for docker compose test. (#1599)
1.Add cache path check
2.Enable CodeGen,CodeTrans and DocSum model cache for docker compose test.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-04 16:10:20 +08:00
chen, suyue
8f8d3af7c3 open chatqna frontend test (#1594)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-04 10:41:22 +08:00
ZePan110
e4de76da78 Use model cache for docker compose test (#1582)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-04 09:48:27 +08:00
Spycsh
ce38a84372 Revert chatqna async and enhance tests (#1598)
align with opea-project/GenAIComps#1354
2025-03-03 23:03:44 +08:00
Ying Hu
e8b07c28ec Update DBQnA tgi docker image to latest tgi 2.4.0 (#1593) 2025-03-03 16:17:19 +08:00
chen, suyue
7b3a125bdf Fix cd workflow condition (#1588)
Fix cd workflow condition

Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
2025-03-03 08:45:10 +08:00
Eze Lanza (Eze)
fba0de45d2 ChatQnA Docker compose file for Milvus as vdb (#1548)
Signed-off-by: Ezequiel Lanza <ezequiel.lanza@gmail.com>
Signed-off-by: Kendall González León <kendall.gonzalez.leon@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: Spycsh <sihan.chen@intel.com>
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Signed-off-by: ZePan110 <ze.pan@intel.com>
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: minmin-intel <minmin.hou@intel.com>
Signed-off-by: Artem Astafev <a.astafev@datamonsters.com>
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
Signed-off-by: alexsin368 <alex.sin@intel.com>
Signed-off-by: WenjiaoYue <wenjiao.yue@intel.com>
Co-authored-by: Ezequiel Lanza <emlanza@CDQ242RKJDmac.local>
Co-authored-by: Kendall González León <kendallgonzalez@hotmail.es>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
Co-authored-by: Spycsh <39623753+Spycsh@users.noreply.github.com>
Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com>
Co-authored-by: jotpalch <49465120+jotpalch@users.noreply.github.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: minmin-intel <minmin.hou@intel.com>
Co-authored-by: Ying Hu <ying.hu@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eero Tamminen <eero.t.tamminen@intel.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
Co-authored-by: Artem Astafev <a.astafev@datamonsters.com>
Co-authored-by: XinyaoWa <xinyao.wang@intel.com>
Co-authored-by: alexsin368 <109180236+alexsin368@users.noreply.github.com>
Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
2025-02-28 22:40:31 +08:00
WenjiaoYue
f2a5644d9c fix click example button issue (#1586)
Signed-off-by: WenjiaoYue <wenjiao.yue@intel.com>
2025-02-28 16:10:58 +08:00
alexsin368
6cd7827365 Top level README: add link to github.io documentation (#1584)
Signed-off-by: alexsin368 <alex.sin@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-28 13:43:43 +08:00
chen, suyue
3d8009aa91 Fix benchmark scripts (#1517)
- Align benchmark default config:  
1. Update default helm charts version. 
2. Add `# mandatory` comment. 
3. Update default model ID for LLM. 
- Fix deploy issue:  
1. Support different `replicaCount` for w/ w/o rerank test. 
2. Add `max_num_seqs` for vllm. 
3. Add resource setting for tune mode. 

- Fix benchmark issues: 
1. Update the `user_queries` and `concurrency` settings. 
2. Remove invalid parameters. 
3. Fix the `dataset` and `prompt` settings, and ingest the dataset into the DB. 
4. Fix the benchmark hang with large user query counts; setting `"processes": 16` resolves it. 
5. Update the eval_path setting logic. 
- Optimize the benchmark README. 
- Optimize the log path to make the logs more readable. 

Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
2025-02-28 10:30:54 +08:00
XinyaoWa
78f8ae524d Fix async in chatqna bug (#1589)
Align async with comps; related PR: opea-project/GenAIComps#1300

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
2025-02-27 23:32:29 +08:00
Artem Astafev
6abf7652e8 Fix ChatQnA ROCm compose Readme file and absolute path for ROCM CI test (#1159)
Signed-off-by: Artem Astafev <a.astafev@datamonsters.com>
2025-02-27 15:26:45 +08:00
Spycsh
25c1aefc27 Align mongo related image names with comps (#1543)
- chathistory-mongo-server -> chathistory-mongo (except container names)
- feedbackmanagement -> feedbackmanagement-mongo
- promptregistry-server/promptregistry-mongo-server -> promptregistry-mongo (except container names)

Signed-off-by: Spycsh <sihan.chen@intel.com>
2025-02-27 09:25:49 +08:00
dependabot[bot]
d46df4331d Bump gradio from 5.5.0 to 5.11.0 in /DocSum/ui/gradio (#1576)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
2025-02-25 14:32:03 +08:00
Eero Tamminen
23a77df302 Fix "OpenAI" & "response" spelling (#1561) 2025-02-25 12:45:21 +08:00
Ying Hu
852bc7027c Update README.md of AIPC quick start (#1578)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-23 17:38:27 +08:00
minmin-intel
a7eced4161 Update AgentQnA and DocIndexRetriever (#1564)
Signed-off-by: minmin-intel <minmin.hou@intel.com>
2025-02-22 09:51:26 +08:00
ZePan110
caec354324 Fix trivy issue (#1569)
Fix docker image security issue

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-20 14:41:52 +08:00
xiguiw
d482554a6b Fix mismatched environment variable (#1575)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-19 19:24:10 +08:00
xiguiw
2ae6871fc5 Simplify ChatQnA AIPC user setting (#1573)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-19 16:30:02 +08:00
dependabot[bot]
2ac5be9921 Bump gradio from 5.5.0 to 5.11.0 in /MultimodalQnA/ui/gradio (#1391)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
2025-02-19 15:58:46 +08:00
ZePan110
799881a3fa Remove perf test code from test scripts. (#1510)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-18 16:23:49 +08:00
jotpalch
e5c6418c81 Fix minor typo in README (#1559)
Change Docker Compost<br/>Deployment on ROCm to Docker Compose<br/>Deployment on ROCm
2025-02-17 12:07:31 +08:00
xiguiw
0c0edffc5b update vLLM CPU to the latest stable version (#1546)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
2025-02-17 08:26:25 +08:00
Spycsh
9f36e84c1c Refactor AudioQnA README (#1508)
Signed-off-by: Spycsh <sihan.chen@intel.com>
2025-02-15 11:30:16 +08:00
chen, suyue
8c547c2ba5 Expand CI test scope for common test scripts (#1554)
Expand CI test scope: trigger all hardware tests when the common test scripts change.

Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-02-14 18:17:03 +08:00
Kendall González León
80dd86f122 Make a fix in the main README.md of the ChatQnA. (#1551)
Signed-off-by: Kendall González León <kendall.gonzalez.leon@intel.com>
2025-02-14 17:00:44 +08:00
ZePan110
6d781f7b2b Fix CICD workflow strategy running condition (#1533)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-13 16:10:00 +08:00
WenjiaoYue
abafd5de20 Update UI of the three demos: faqGen, VisualQnA, and DocSum. (#1528)
Signed-off-by: WenjiaoYue <wenjiao.yue@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-12 15:57:51 +08:00
Louie Tsai
970b869838 Add a new section to change LLM model such as deepseek based on validated model table in LLM microservice (#1501)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Wang, Kai Lawrence <109344418+wangkl2@users.noreply.github.com>
Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com>
2025-02-12 09:34:56 +08:00
XinyaoWa
87ff149f61 Remove vllm hpu triton version fix (#1515)
vllm-fork has fixed the triton version issue, so the duplicated pin is removed: https://github.com/HabanaAI/vllm-fork/blob/habana_main/requirements-hpu.txt

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
2025-02-12 09:24:38 +08:00
chen, suyue
c39a569ab2 Update workflow condition and env (#1522)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-02-12 09:08:22 +08:00
chen, suyue
81b02bb947 Revert "HUGGINGFACEHUB_API_TOKEN environment is change to HF_TOKEN (#… (#1521)
Revert this PR since the test was not triggered properly due to the accidental merge of a WIP CI PR, 44a689b0bf, which blocked the CI test.

This change will be submitted in another PR.
2025-02-11 18:36:12 +08:00
Louie Tsai
47069ac70c fix a test script issue due to name change for telemetry yaml files (#1516)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-02-11 17:58:42 +08:00
chen, suyue
6ce7730863 Update CI/CD workflow (#1520)
1. Update auto commit account.
2. Fix test condition.

Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-11 17:48:37 +08:00
Louie Tsai
ad5523bac7 Enable OpenTelemtry Tracing for ChatQnA on Xeon and Gaudi by docker compose merge feature (#1488)
Signed-off-by: Louie, Tsai <louie.tsai@intel.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-02-10 22:58:50 -08:00
Louie Tsai
88a8235f21 Update README.md for Agent UI (#1495)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-02-10 22:22:55 -08:00
ZePan110
63ad850052 Update docker image list (#1513)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-11 13:18:22 +08:00
ZePan110
9a0c547112 Fix publish issue (#1514)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-11 11:43:00 +08:00
ZePan110
26a6da4123 Fix nightly triggered exceptions (#1505)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-10 16:51:34 +08:00
xiguiw
45d5da2ddd HUGGINGFACEHUB_API_TOKEN environment is change to HF_TOKEN (#1503)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-09 20:33:06 +08:00
xiguiw
1b3291a1c8 Fix docker compose.yaml error (#1496)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-07 09:53:20 +08:00
ZePan110
7ac8cf517a Restore test code. (#1502)
Remove nightly test code.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-07 09:50:21 +08:00
ZePan110
44a689b0bf Fix null value_file judgment (#1470)
Signed-off-by: ZePan110 <ze.pan@intel.com>
Co-authored-by: Malini Bhandaru <malini.bhandaru@intel.com>
2025-02-06 17:09:01 +08:00
xiguiw
388d3eb5c5 [Doc] Clean empty document (#1497)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-06 10:53:25 +08:00
chyundunovDatamonsters
ef9ad61440 DBQnA - Adding files to deploy DBQnA application on AMD GPU (#1273)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
Co-authored-by: Chingis Yundunov <YundunovCN@sibedge.com>
Co-authored-by: Malini Bhandaru <malini.bhandaru@intel.com>
2025-02-06 09:41:59 +08:00
Louie Tsai
4c41a5db83 Update README.md for OPEA OTLP tracing (#1406)
Signed-off-by: louie-tsai <louie.tsai@intel.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Eero Tamminen <eero.t.tamminen@intel.com>
2025-02-05 13:03:15 -08:00
Liang Lv
9adf7a6af0 Add support for latest deepseek models on Gaudi (#1491)
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
2025-02-05 08:30:04 +08:00
chen, suyue
a4d028e8ea update image release workflow (#1303)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: Malini Bhandaru <malini.bhandaru@intel.com>
2025-02-03 17:07:07 -08:00
Omar Khleif
32d4f714fd Fix for NLTK related import failure (#1487)
Signed-off-by: okhleif-IL <omar.khleif@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-01 10:04:37 +08:00
chyundunovDatamonsters
fdbc27a9b5 AvatarChatbot - Adding files to deploy AvatarChatbot application on AMD GPU (#1288)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
2025-01-27 11:30:52 +08:00
XinyuYe-Intel
5f4b1828a5 Added UT for rerank finetuning on Gaudi (#1472)
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
2025-01-27 11:24:05 +08:00
chyundunovDatamonsters
39abef8be8 SearchQnA App - Adding files to deploy SearchQnA application on AMD GPU (#1193)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
2025-01-27 10:58:55 +08:00
bjzhjing
ed163087ba Provide unified scalable deployment and benchmarking support for exam… (#1315)
Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-01-24 22:27:49 +08:00
chen, suyue
259099d19f Remove kubernetes manifest related code and tests (#1466)
Remove deprecated kubernetes manifest related code and tests.
The k8s implementation for those examples, based on helm charts, will target the next release.

Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-01-24 15:23:12 +08:00
chen, suyue
9a1118730b Freeze the triton version in vllm-gaudi image to 3.1.0 (#1463)
The new triton version 3.2.0 can't work with vllm-gaudi. Freeze the triton version in vllm-gaudi image to 3.1.0.

Issue created for vllm-fork: HabanaAI/vllm-fork#732
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-01-24 09:50:59 +08:00
537 changed files with 49366 additions and 11313 deletions

View File

@@ -1,2 +1,3 @@
ModelIn
modelin
pressEnter

View File

@@ -0,0 +1,65 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
name: Build Comps Base Image
permissions: read-all
on:
workflow_call:
inputs:
node:
required: true
type: string
build:
default: true
required: false
type: boolean
tag:
default: "latest"
required: false
type: string
opea_branch:
default: "main"
required: false
type: string
inject_commit:
default: false
required: false
type: boolean
jobs:
pre-build-image-check:
runs-on: ubuntu-latest
outputs:
should_skip: ${{ steps.check-skip.outputs.should_skip }}
steps:
- name: Check if job should be skipped
id: check-skip
run: |
should_skip=false
if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
should_skip=true
fi
echo "should_skip=$should_skip"
echo "should_skip=$should_skip" >> $GITHUB_OUTPUT
build-images:
needs: [ pre-build-image-check ]
if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' && fromJSON(inputs.build) }}
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*
- name: Clone Required Repo
run: |
git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
cd GenAIComps && git rev-parse HEAD && cd ../ && ls -l
- name: Build Image
uses: opea-project/validation/actions/image-build@main
with:
work_dir: ${{ github.workspace }}/GenAIComps
docker_compose_path: ${{ github.workspace }}/GenAIComps/.github/workflows/docker/compose/base-compose.yaml
registry: ${OPEA_IMAGE_REPO}opea
inject_commit: ${{ inputs.inject_commit }}
tag: ${{ inputs.tag }}
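A rough local equivalent of the build step above, for reference only (it assumes the GenAIComps compose file can be built stand-alone with docker compose, outside the opea-project/validation image-build action):

```bash
# Sketch: build the comps base image locally from the same compose file the workflow points at.
git clone --depth 1 --branch main https://github.com/opea-project/GenAIComps.git
cd GenAIComps && git rev-parse HEAD
docker compose -f .github/workflows/docker/compose/base-compose.yaml build
```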

.github/workflows/_build_image.yml (vendored, new file, 103 lines)
View File

@@ -0,0 +1,103 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
name: Build Images
permissions: read-all
on:
workflow_call:
inputs:
node:
required: true
type: string
build:
default: true
required: false
type: boolean
example:
required: true
type: string
services:
default: ""
required: false
type: string
tag:
default: "latest"
required: false
type: string
opea_branch:
default: "main"
required: false
type: string
inject_commit:
default: false
required: false
type: boolean
jobs:
pre-build-image-check:
runs-on: ubuntu-latest
outputs:
should_skip: ${{ steps.check-skip.outputs.should_skip }}
steps:
- name: Check if job should be skipped
id: check-skip
run: |
should_skip=false
if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
should_skip=true
fi
echo "should_skip=$should_skip"
echo "should_skip=$should_skip" >> $GITHUB_OUTPUT
build-images:
needs: [ pre-build-image-check ]
if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' && fromJSON(inputs.build) }}
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*
- name: Get Checkout Ref
run: |
if [ "${{ github.event_name }}" == "pull_request" ] || [ "${{ github.event_name }}" == "pull_request_target" ]; then
echo "CHECKOUT_REF=refs/pull/${{ github.event.number }}/merge" >> $GITHUB_ENV
else
echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
fi
- name: Checkout out GenAIExamples
uses: actions/checkout@v4
with:
ref: ${{ env.CHECKOUT_REF }}
fetch-depth: 0
- name: Clone Required Repo
run: |
cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
git clone https://github.com/vllm-project/vllm.git && cd vllm
# Get the latest tag
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null && cd ../
fi
if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
# Get the latest tag
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null && cd ../
fi
git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
cd GenAIComps && git rev-parse HEAD && cd ../
- name: Build Image
uses: opea-project/validation/actions/image-build@main
with:
work_dir: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
service_list: ${{ inputs.services }}
registry: ${OPEA_IMAGE_REPO}opea
inject_commit: ${{ inputs.inject_commit }}
tag: ${{ inputs.tag }}
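For reference, the latest-tag checkout pattern used for the vLLM repos above can be run stand-alone (vLLM shown; vllm-fork is analogous):

```bash
git clone https://github.com/vllm-project/vllm.git && cd vllm
# Resolve the newest tag in the repository and check it out
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
echo "Check out vLLM tag ${VLLM_VER}"
git checkout "${VLLM_VER}"
```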

View File

@@ -43,68 +43,39 @@ on:
inject_commit:
default: false
required: false
type: string
type: boolean
use_model_cache:
default: false
required: false
type: boolean
jobs:
####################################################################################################
# Image Build
####################################################################################################
build-images:
runs-on: "docker-build-${{ inputs.node }}"
steps:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*
- name: Get Checkout Ref
run: |
if [ "${{ github.event_name }}" == "pull_request" ] || [ "${{ github.event_name }}" == "pull_request_target" ]; then
echo "CHECKOUT_REF=refs/pull/${{ github.event.number }}/merge" >> $GITHUB_ENV
else
echo "CHECKOUT_REF=${{ github.ref }}" >> $GITHUB_ENV
fi
- name: Checkout out GenAIExamples
uses: actions/checkout@v4
with:
ref: ${{ env.CHECKOUT_REF }}
fetch-depth: 0
- name: Clone Required Repo
run: |
cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
git clone --depth 1 https://github.com/vllm-project/vllm.git
cd vllm && git rev-parse HEAD && cd ../
fi
if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
fi
git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
cd GenAIComps && git rev-parse HEAD && cd ../
- name: Build Image
if: ${{ fromJSON(inputs.build) }}
uses: opea-project/validation/actions/image-build@main
with:
work_dir: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
service_list: ${{ inputs.services }}
registry: ${OPEA_IMAGE_REPO}opea
inject_commit: ${{ inputs.inject_commit }}
tag: ${{ inputs.tag }}
uses: ./.github/workflows/_build_image.yml
with:
node: ${{ inputs.node }}
build: ${{ fromJSON(inputs.build) }}
example: ${{ inputs.example }}
services: ${{ inputs.services }}
tag: ${{ inputs.tag }}
opea_branch: ${{ inputs.opea_branch }}
inject_commit: ${{ inputs.inject_commit }}
####################################################################################################
# Docker Compose Test
####################################################################################################
test-example-compose:
needs: [build-images]
if: ${{ fromJSON(inputs.test_compose) }}
if: ${{ inputs.test_compose }}
uses: ./.github/workflows/_run-docker-compose.yml
with:
tag: ${{ inputs.tag }}
example: ${{ inputs.example }}
hardware: ${{ inputs.node }}
use_model_cache: ${{ inputs.use_model_cache }}
secrets: inherit
@@ -126,7 +97,7 @@ jobs:
####################################################################################################
test-gmc-pipeline:
needs: [build-images]
if: ${{ fromJSON(inputs.test_gmc) }}
if: false # ${{ fromJSON(inputs.test_gmc) }}
uses: ./.github/workflows/_gmc-e2e.yml
with:
example: ${{ inputs.example }}

View File

@@ -97,6 +97,7 @@ jobs:
helm-test:
needs: [get-test-case]
if: ${{ needs.get-test-case.outputs.value_files != '[]' }}
strategy:
matrix:
value_file: ${{ fromJSON(needs.get-test-case.outputs.value_files) }}

View File

@@ -28,6 +28,10 @@ on:
required: false
type: string
default: ""
use_model_cache:
required: false
type: boolean
default: false
jobs:
get-test-case:
runs-on: ubuntu-latest
@@ -60,9 +64,14 @@ jobs:
cd ${{ github.workspace }}/${{ inputs.example }}/tests
run_test_cases=""
default_test_case=$(find . -type f -name "test_compose_on_${{ inputs.hardware }}.sh" | cut -d/ -f2)
if [ "${{ inputs.hardware }}" == "gaudi2" ] || [ "${{ inputs.hardware }}" == "gaudi3" ]; then
hardware="gaudi"
else
hardware="${{ inputs.hardware }}"
fi
default_test_case=$(find . -type f -name "test_compose_on_$hardware.sh" | cut -d/ -f2)
if [ "$default_test_case" ]; then run_test_cases="$default_test_case"; fi
other_test_cases=$(find . -type f -name "test_compose_*_on_${{ inputs.hardware }}.sh" | cut -d/ -f2)
other_test_cases=$(find . -type f -name "test_compose_*_on_$hardware.sh" | cut -d/ -f2)
echo "default_test_case=$default_test_case"
echo "other_test_cases=$other_test_cases"
@@ -85,12 +94,17 @@ jobs:
fi
done
if [ -z "$run_test_cases" ] && [[ $(printf '%s\n' "${changed_files[@]}" | grep ${{ inputs.example }} | grep /tests/) ]]; then
run_test_cases=$other_test_cases
fi
test_cases=$(echo $run_test_cases | tr ' ' '\n' | sort -u | jq -R '.' | jq -sc '.')
echo "test_cases=$test_cases"
echo "test_cases=$test_cases" >> $GITHUB_OUTPUT
compose-test:
needs: [get-test-case]
if: ${{ needs.get-test-case.outputs.test_cases != '[""]' }}
strategy:
matrix:
test_case: ${{ fromJSON(needs.get-test-case.outputs.test_cases) }}
@@ -126,6 +140,7 @@ jobs:
shell: bash
env:
HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
HF_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
PINECONE_KEY: ${{ secrets.PINECONE_KEY }}
@@ -138,10 +153,19 @@ jobs:
example: ${{ inputs.example }}
hardware: ${{ inputs.hardware }}
test_case: ${{ matrix.test_case }}
use_model_cache: ${{ inputs.use_model_cache }}
run: |
cd ${{ github.workspace }}/$example/tests
if [[ "$IMAGE_REPO" == "" ]]; then export IMAGE_REPO="${OPEA_IMAGE_REPO}opea"; fi
if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi
if [[ "$use_model_cache" == "true" ]]; then
if [ -d "/data2/hf_model" ]; then
export model_cache="/data2/hf_model"
else
echo "Model cache directory /data2/hf_model does not exist"
export model_cache="~/.cache/huggingface/hub"
fi
fi
if [ -f "${test_case}" ]; then timeout 30m bash "${test_case}"; else echo "Test script {${test_case}} not found, skip test!"; fi
- name: Clean up container after test
shell: bash
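A hedged sketch of how a test could consume the exported `model_cache` path; the image name and volume mapping below are illustrative assumptions, not taken from a specific compose file in this diff:

```bash
if [ -d "/data2/hf_model" ]; then
  export model_cache="/data2/hf_model"            # shared cache on the CI runner
else
  export model_cache="$HOME/.cache/huggingface/hub"
fi
# Hypothetical serving container reusing the cache so models are not re-downloaded
docker run --rm -v "${model_cache}:/data" -e HF_HOME=/data example/llm-serving:latest
```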

View File

@@ -0,0 +1,28 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
name: Check stale issue and pr
on:
schedule:
- cron: "30 22 * * *"
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v9
with:
days-before-issue-stale: 30
days-before-pr-stale: 30
days-before-issue-close: 7
days-before-pr-close: 7
stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
stale-pr-message: "This PR is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
close-issue-message: "This issue was closed because it has been stalled for 7 days with no activity."
close-pr-message: "This PR was closed because it has been stalled for 7 days with no activity."
repo-token: ${{ secrets.ACTION_TOKEN }}
start-date: "2025-03-01T00:00:00Z"

View File

@@ -41,9 +41,11 @@ jobs:
publish:
needs: [get-image-list]
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
fail-fast: false
runs-on: "docker-build-${{ inputs.node }}"
steps:
- uses: docker/login-action@v3.2.0

View File

@@ -12,7 +12,7 @@ on:
type: string
examples:
default: ""
description: 'List of examples to publish "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,FaqGen,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA"'
description: 'List of examples to publish "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA"'
required: false
type: string
images:
@@ -47,6 +47,7 @@ jobs:
scan-docker:
needs: get-image-list
runs-on: "docker-build-${{ inputs.node }}"
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
image: ${{ fromJson(needs.get-image-list.outputs.matrix) }}

View File

@@ -7,7 +7,7 @@ on:
inputs:
nodes:
default: "gaudi,xeon"
description: "Hardware to run test"
description: "Hardware to run test gaudi,gaudi3,xeon,rocm,arc"
required: true
type: string
examples:
@@ -20,11 +20,6 @@ on:
description: "Tag to apply to images"
required: true
type: string
deploy_gmc:
default: false
description: 'Whether to deploy gmc'
required: true
type: boolean
build:
default: true
description: 'Build test required images for Examples'
@@ -40,11 +35,6 @@ on:
description: 'Test examples with helm charts'
required: false
type: boolean
test_gmc:
default: false
description: 'Test examples with gmc'
required: false
type: boolean
opea_branch:
default: "main"
description: 'OPEA branch for image build'
@@ -52,9 +42,14 @@ on:
type: string
inject_commit:
default: false
description: "inject commit to docker images true or false"
description: "inject commit to docker images"
required: false
type: string
type: boolean
use_model_cache:
default: false
description: "use model cache"
required: false
type: boolean
permissions: read-all
jobs:
@@ -74,23 +69,20 @@ jobs:
nodes_json=$(printf '%s\n' "${nodes[@]}" | sort -u | jq -R '.' | jq -sc '.')
echo "nodes=$nodes_json" >> $GITHUB_OUTPUT
build-deploy-gmc:
build-comps-base:
needs: [get-test-matrix]
if: ${{ fromJSON(inputs.deploy_gmc) }}
strategy:
matrix:
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
fail-fast: false
uses: ./.github/workflows/_gmc-workflow.yml
uses: ./.github/workflows/_build_comps_base_image.yml
with:
node: ${{ matrix.node }}
build: ${{ fromJSON(inputs.build) }}
tag: ${{ inputs.tag }}
opea_branch: ${{ inputs.opea_branch }}
secrets: inherit
run-examples:
needs: [get-test-matrix, build-deploy-gmc]
if: always()
needs: [get-test-matrix, build-comps-base]
strategy:
matrix:
example: ${{ fromJson(needs.get-test-matrix.outputs.examples) }}
@@ -104,7 +96,7 @@ jobs:
build: ${{ fromJSON(inputs.build) }}
test_compose: ${{ fromJSON(inputs.test_compose) }}
test_helmchart: ${{ fromJSON(inputs.test_helmchart) }}
test_gmc: ${{ fromJSON(inputs.test_gmc) }}
opea_branch: ${{ inputs.opea_branch }}
inject_commit: ${{ inputs.inject_commit }}
use_model_cache: ${{ inputs.use_model_cache }}
secrets: inherit
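With `deploy_gmc`/`test_gmc` removed and `use_model_cache` added, a manual run could be triggered roughly like this; the workflow file name is an assumption, adjust it to the actual file under `.github/workflows/`:

```bash
gh workflow run manual-example-workflow.yml \
  -f nodes="gaudi,xeon" \
  -f examples="ChatQnA" \
  -f tag="latest" \
  -f test_compose=true \
  -f use_model_cache=true \
  -f inject_commit=false
```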

View File

@@ -25,9 +25,9 @@ jobs:
- name: Set up Git
run: |
git config --global user.name "NeuralChatBot"
git config --global user.email "grp_neural_chat_bot@intel.com"
git remote set-url origin https://NeuralChatBot:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIExamples.git
git config --global user.name "CICD-at-OPEA"
git config --global user.email "CICD@opea.dev"
git remote set-url origin https://CICD-at-OPEA:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIExamples.git
- name: Run script
run: |

View File

@@ -32,9 +32,9 @@ on:
type: string
inject_commit:
default: false
description: "inject commit to docker images true or false"
description: "inject commit to docker images"
required: false
type: string
type: boolean
jobs:
get-test-matrix:
@@ -51,6 +51,7 @@ jobs:
image-build:
needs: get-test-matrix
if: ${{ needs.get-test-matrix.outputs.nodes != '' }}
strategy:
matrix:
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}

View File

@@ -33,6 +33,7 @@ jobs:
clean-up:
needs: get-build-matrix
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
node: ${{ fromJson(needs.get-build-matrix.outputs.nodes) }}
@@ -47,6 +48,7 @@ jobs:
build:
needs: [get-build-matrix, clean-up]
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
example: ${{ fromJson(needs.get-build-matrix.outputs.examples) }}

View File

@@ -32,8 +32,15 @@ jobs:
echo "TAG=$TAG" >> $GITHUB_OUTPUT
echo "PUBLISH_TAGS=$PUBLISH_TAGS" >> $GITHUB_OUTPUT
build-comps-base:
needs: [get-build-matrix]
uses: ./.github/workflows/_build_comps_base_image.yml
with:
node: gaudi
build-and-test:
needs: get-build-matrix
if: ${{ needs.get-build-matrix.outputs.examples_json != '' }}
strategy:
matrix:
example: ${{ fromJSON(needs.get-build-matrix.outputs.examples_json) }}
@@ -43,6 +50,7 @@ jobs:
node: gaudi
example: ${{ matrix.example }}
test_compose: true
inject_commit: true
secrets: inherit
get-image-list:
@@ -53,9 +61,11 @@ jobs:
publish:
needs: [get-build-matrix, get-image-list, build-and-test]
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
fail-fast: false
runs-on: "docker-build-gaudi"
steps:
- uses: docker/login-action@v3.2.0

View File

@@ -65,7 +65,7 @@ jobs:
helm-chart-test:
needs: [job1]
if: always() && ${{ needs.job1.outputs.run_matrix.example.length > 0 }}
if: always() && ${{ fromJSON(needs.job1.outputs.run_matrix).length != 0 }}
uses: ./.github/workflows/_helm-e2e.yml
strategy:
matrix: ${{ fromJSON(needs.job1.outputs.run_matrix) }}

View File

@@ -32,15 +32,16 @@ jobs:
example-test:
needs: [get-test-matrix]
if: ${{ needs.get-test-matrix.outputs.run_matrix != '' }}
strategy:
matrix: ${{ fromJSON(needs.get-test-matrix.outputs.run_matrix) }}
fail-fast: false
if: ${{ !github.event.pull_request.draft }}
uses: ./.github/workflows/_run-docker-compose.yml
with:
registry: "opea"
tag: "ci"
example: ${{ matrix.example }}
hardware: ${{ matrix.hardware }}
use_model_cache: true
diff_excluded_files: '\.github|\.md|\.txt|kubernetes|gmc|assets|benchmark'
secrets: inherit

View File

@@ -24,6 +24,7 @@ jobs:
image-build:
needs: job1
if: ${{ needs.job1.outputs.run_matrix != '{"include":[]}' }}
strategy:
matrix: ${{ fromJSON(needs.job1.outputs.run_matrix) }}
fail-fast: false

View File

@@ -54,6 +54,6 @@ jobs:
${{ env.changed_files }}
Please verify if the helm charts and manifests need to be changed accordingly.
Please verify if the helm charts need to be changed accordingly.
> This issue was created automatically by CI.

View File

@@ -30,13 +30,20 @@ case "$1" in
echo "$ports"
for port in $ports; do
if [[ $port =~ [a-zA-Z_-] ]]; then
port=$(grep -E "export $port=" tests/$test_case | cut -d'=' -f2)
echo "Search port value $port from the test case..."
port_fix=$(grep -E "export $port=" tests/$test_case | cut -d'=' -f2)
if [[ "$port_fix" == "" ]]; then
echo "Can't find the port value from the test case, use the default value in yaml..."
port_fix=$(yq '.services[].ports[]' $yaml_file | grep $port | cut -d':' -f2 | grep -o '[0-9a-zA-Z]\+')
fi
port=$port_fix
fi
if [[ $port =~ [0-9] ]]; then
if [[ $port == 5000 ]]; then
echo "Error: Port 5000 is used by local docker registry, please DO NOT use it in docker compose deployment!!!"
exit 1
fi
echo "Check port $port..."
cid=$(docker ps --filter "publish=${port}" --format "{{.ID}}")
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && echo "release $port"; fi
fi
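The release step above can also be run stand-alone to free a port before a compose test; the port value here is a hypothetical example:

```bash
port=7000   # normally resolved from the test case or the compose yaml as shown above
cid=$(docker ps --filter "publish=${port}" --format "{{.ID}}")
if [[ -n "$cid" ]]; then docker stop "$cid" && docker rm "$cid" && echo "release $port"; fi
```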

View File

@@ -12,6 +12,7 @@ run_matrix="{\"include\":["
examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u)
for example in ${examples}; do
if [[ ! -d $WORKSPACE/$example ]]; then continue; fi
cd $WORKSPACE/$example
if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi
cd tests
@@ -26,7 +27,10 @@ for example in ${examples}; do
run_hardware=""
if [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | cut -d'/' -f2 | grep -E '\.py|Dockerfile*|ui|docker_image_build' ) ]]; then
# run test on all hardware if megaservice or ui code change
echo "run test on all hardware if megaservice or ui code change..."
run_hardware=$hardware_list
elif [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | grep 'tests'| cut -d'/' -f3 | grep -vE '^test_|^_test' ) ]]; then
echo "run test on all hardware if common test scripts change..."
run_hardware=$hardware_list
else
for hardware in ${hardware_list}; do
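As a quick illustration of the new trigger rule above (run on all hardware when a shared, non `test_compose_*` script under `tests/` changes), with a hypothetical changed file:

```bash
changed_files="ChatQnA/tests/common_utils.sh"   # hypothetical change
if printf '%s\n' "$changed_files" | grep ChatQnA | grep 'tests' | cut -d'/' -f3 | grep -qvE '^test_|^_test'; then
  echo "run test on all hardware: common test script changed"
fi
```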

View File

@@ -1,11 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
name: Weekly update base images and 3rd party images
name: Weekly update 3rd party images
on:
schedule:
- cron: "0 0 * * 0"
workflow_dispatch:
permissions:
@@ -16,8 +14,8 @@ jobs:
freeze-images:
runs-on: ubuntu-latest
env:
USER_NAME: "NeuralChatBot"
USER_EMAIL: "grp_neural_chat_bot@intel.com"
USER_NAME: "CICD-at-OPEA"
USER_EMAIL: "CICD@opea.dev"
BRANCH_NAME: "update_images_tag"
steps:
- name: Checkout repository

View File

@@ -2,7 +2,7 @@
## Overview
This example showcases a hierarchical multi-agent system for question-answering applications. The architecture diagram is shown below. The supervisor agent interfaces with the user and dispatch tasks to two worker agents to gather information and come up with answers. The worker RAG agent uses the retrieval tool to retrieve relevant documents from the knowledge base (a vector database). The worker SQL agent retrieve relevant data from the SQL database. Although not included in this example, but other tools such as a web search tool or a knowledge graph query tool can be used by the supervisor agent to gather information from additional sources.
This example showcases a hierarchical multi-agent system for question-answering applications. The architecture diagram below shows a supervisor agent that interfaces with the user and dispatches tasks to two worker agents to gather information and come up with answers. The worker RAG agent uses the retrieval tool to retrieve relevant documents from a knowledge base - a vector database. The worker SQL agent retrieves relevant data from a SQL database. Although not included in this example by default, other tools such as a web search tool or a knowledge graph query tool can be used by the supervisor agent to gather information from additional sources.
![Architecture Overview](assets/img/agent_qna_arch.png)
The AgentQnA example is implemented using the component-level microservices defined in [GenAIComps](https://github.com/opea-project/GenAIComps). The flow chart below shows the information flow between different microservices for this example.
@@ -75,201 +75,161 @@ flowchart LR
```
### Why Agent for question answering?
### Why should AI Agents be used for question-answering?
1. Improve relevancy of retrieved context.
RAG agent can rephrase user queries, decompose user queries, and iterate to get the most relevant context for answering user's questions. Compared to conventional RAG, RAG agent can significantly improve the correctness and relevancy of the answer.
2. Expand scope of the agent.
The supervisor agent can interact with multiple worker agents that specialize in different domains with different skills (e.g., retrieve documents, write SQL queries, etc.), and thus can answer questions in multiple domains.
3. Hierarchical multi-agents can improve performance.
Expert worker agents, such as RAG agent and SQL agent, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information together to provide a comprehensive answer. If we only use one agent and provide all the tools to this single agent, it may get overwhelmed and not able to provide accurate answers.
1. **Improve relevancy of retrieved context.**
RAG agents can rephrase user queries, decompose user queries, and iterate to get the most relevant context for answering a user's question. Compared to conventional RAG, RAG agents significantly improve the correctness and relevancy of the answer because of the iterations it goes through.
2. **Expand scope of skills.**
The supervisor agent interacts with multiple worker agents that specialize in different skills (e.g., retrieve documents, write SQL queries, etc.). Thus, it can answer questions with different methods.
3. **Hierarchical multi-agents improve performance.**
Expert worker agents, such as RAG agents and SQL agents, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information to provide a comprehensive answer. If only one agent is used and all tools are given to that single agent, it can incur large overhead or fail to pick the best tool, reducing answer accuracy.
## Deployment with docker
## Deploy with docker
1. Build agent docker image [Optional]
### 1. Set up environment </br>
> [!NOTE]
> the step is optional. The docker images will be automatically pulled when running the docker compose commands. This step is only needed if pulling images failed.
First, clone the opea GenAIComps repo.
#### First, clone the `GenAIExamples` repo.
```
export WORKDIR=<your-work-directory>
cd $WORKDIR
git clone https://github.com/opea-project/GenAIComps.git
git clone https://github.com/opea-project/GenAIExamples.git
```
Then build the agent docker image. Both the supervisor agent and the worker agent will use the same docker image, but when we launch the two agents we will specify different strategies and register different tools.
#### Second, set up environment variables.
##### For proxy environments only
```
cd GenAIComps
docker build -t opea/agent:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/agent/src/Dockerfile .
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy"
```
2. Set up environment for this example </br>
##### For using open-source LLMs
First, clone this repo.
```
export HUGGINGFACEHUB_API_TOKEN=<your-HF-token>
export HF_CACHE_DIR=<directory-where-llms-are-downloaded> #so that no need to redownload every time
```
```
export WORKDIR=<your-work-directory>
cd $WORKDIR
git clone https://github.com/opea-project/GenAIExamples.git
```
##### [Optional] OPENAI_API_KEY to use OpenAI models
Second, set up env vars.
```
export OPENAI_API_KEY=<your-openai-key>
```
```
# Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
export host_ip=$(hostname -I | awk '{print $1}')
# if you are in a proxy environment, also set the proxy-related environment variables
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy"
#### Third, set up environment variables for the selected hardware using the corresponding `set_env.sh`
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
# for using open-source llms
export HUGGINGFACEHUB_API_TOKEN=<your-HF-token>
export HF_CACHE_DIR=<directory-where-llms-are-downloaded> #so that no need to redownload every time
##### Gaudi
# optional: OPANAI_API_KEY if you want to use OpenAI models
export OPENAI_API_KEY=<your-openai-key>
```
```
source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh
```
3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
##### Xeon
First, launch the mega-service.
```
source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon/set_env.sh
```
```
cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
bash launch_retrieval_tool.sh
```
### 3. Launch the multi-agent system. </br>
Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
Two options are provided for the `llm_engine` of the agents: 1. open-source LLMs on Gaudi, 2. OpenAI models via API calls.
```
bash run_ingest_data.sh
```
#### Gaudi
4. Prepare SQL database
In this example, we will use the Chinook SQLite database. Run the commands below.
On Gaudi, `meta-llama/Meta-Llama-3.1-70B-Instruct` will be served using vllm.
By default, both the RAG agent and SQL agent will be launched to support the React Agent.
The React Agent requires the DocIndexRetriever's [`compose.yaml`](../DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml) file, so two `compose.yaml` files need to be run with docker compose to start the multi-agent system.
```
# Download data
cd $WORKDIR
git clone https://github.com/lerocha/chinook-database.git
cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
```
> **Note**: To enable the web search tool, skip this step and proceed to the "[Optional] Web Search Tool Support" section.
5. Launch other tools. </br>
In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
```bash
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
```
```
docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
```
##### [Optional] Web Search Tool Support
6. Launch multi-agent system. </br>
We provide two options for `llm_engine` of the agents: 1. open-source LLMs on Intel Gaudi2, 2. OpenAI models via API calls.
<details>
<summary> Instructions </summary>
A web search tool is supported in this example and can be enabled by running docker compose with the `compose.webtool.yaml` file.
The Google Search API is used. Follow the [instructions](https://python.langchain.com/docs/integrations/tools/google_search) to create an API key and enable the Custom Search API on a Google account. The environment variables `GOOGLE_CSE_ID` and `GOOGLE_API_KEY` need to be set.
::::{tab-set}
:::{tab-item} Gaudi
:sync: Gaudi
```bash
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
export GOOGLE_CSE_ID="YOUR_ID"
export GOOGLE_API_KEY="YOUR_API_KEY"
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f compose.webtool.yaml up -d
```
On Gaudi2 we will serve `meta-llama/Meta-Llama-3.1-70B-Instruct` using vllm.
</details>
First build vllm-gaudi docker image.
#### Xeon
```bash
cd $WORKDIR
git clone https://github.com/vllm-project/vllm.git
cd ./vllm
git checkout v0.6.6
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
```
On Xeon, only OpenAI models are supported.
By default, both the RAG Agent and SQL Agent will be launched to support the React Agent.
The React Agent requires the DocIndexRetriever's [`compose.yaml`](../DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml) file, so two `compose.yaml` files need to be run with docker compose to start the multi-agent system.
Then launch vllm on Gaudi2 with the command below.
```bash
export OPENAI_API_KEY=<your-openai-key>
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose_openai.yaml up -d
```
```bash
vllm_port=8086
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
```
### 4. Ingest Data into the vector database
Then launch Agent microservices.
The `run_ingest_data.sh` script uses an example jsonl file to ingest example documents into a vector database. Other ways to ingest data, and the other supported document types, can be found in the OPEA dataprep microservice in the opea-project/GenAIComps repo.
```bash
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
bash launch_agent_service_gaudi.sh
```
```bash
cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/
bash run_ingest_data.sh
```
:::
:::{tab-item} Xeon
:sync: Xeon
> **Note**: This is a one-time operation.
To use OpenAI models, run commands below.
## Launch the UI
```
export OPENAI_API_KEY=<your-openai-key>
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
bash launch_agent_service_openai.sh
```
Open a web browser to http://localhost:5173 to access the UI. Ensure the environment variable `AGENT_URL` is set to http://$ip_address:9090/v1/chat/completions in [ui/svelte/.env](./ui/svelte/.env), otherwise the UI may not work properly (a one-line way to set this is shown after the tab set below).
:::
::::
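A one-line way to write the `.env` file, mirroring what the Gaudi launch script in this PR does (assuming the supervisor agent listens on port 9090):
```bash
export ip_address=$(hostname -I | awk '{print $1}')
echo "AGENT_URL = 'http://$ip_address:9090/v1/chat/completions'" | tee $WORKDIR/GenAIExamples/AgentQnA/ui/svelte/.env
```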
The AgentQnA UI can be deployed locally or using Docker. To customize deployment, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
## Deploy using Helm Chart
## [Optional] Deploy using Helm Charts
Refer to the [AgentQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AgentQnA on Kubernetes.
## Validate services
## Validate Services
First look at logs of the agent docker containers:
1. First look at logs for each of the agent docker containers:
```
```bash
# worker RAG agent
docker logs rag-agent-endpoint
# worker SQL agent
docker logs sql-agent-endpoint
```
```
# supervisor agent
docker logs react-agent-endpoint
```
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
Look for the message "HTTP server setup successful" to confirm the agent docker container has started successfully.
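For example, a generic check (not part of the original docs) across all three agent containers:
```bash
for c in rag-agent-endpoint sql-agent-endpoint react-agent-endpoint; do
  docker logs $c 2>&1 | grep -q "HTTP server setup successful" && echo "$c is up" || echo "$c is NOT ready"
done
```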
Second, validate worker RAG agent:
2. Use Python to validate that each agent is working properly:
```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "Michael Jackson song Thriller"
}'
```bash
# RAG worker agent
python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
# SQL agent
python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
# supervisor agent: this will test a two-turn conversation
python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port 9090
```
Third, validate worker SQL agent:
## How to register other tools with the AI agent
```
curl http://${host_ip}:9096/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many employees are in the company"
}'
```
Finally, validate supervisor agent:
```
curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many albums does Iron Maiden have?"
}'
```
## Deploy AgentQnA UI
The AgentQnA UI can be deployed locally or using Docker.
For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
## How to register your own tools with agent
You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
The [tools](./tools) folder contains YAML and Python files for additional tools for the supervisor and worker agents. Refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md) to add tools and customize the AI agents.
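As a hypothetical illustration (the tool name, arguments, and implementation below are not part of this PR), a new tool could be registered by appending an entry to the supervisor tools YAML and a matching function to `tools.py`, following the same schema used by the existing tools:
```bash
cd $WORKDIR/GenAIExamples/AgentQnA/tools

# 1) Describe the tool (same fields as the existing entries: description, callable_api, args_schema, return_output)
cat >> supervisor_agent_tools.yaml <<'EOF'
get_weather:
  description: Get the current weather for a city.
  callable_api: tools.py:get_weather
  args_schema:
    city:
      type: str
      description: city name
  return_output: weather_info
EOF

# 2) Implement the callable in tools.py (toy body for illustration only)
cat >> tools.py <<'EOF'
def get_weather(city: str) -> str:
    """Hypothetical tool; replace the body with a real API call."""
    return f"The weather in {city} is sunny."
EOF
```
Since the tools folder is mounted into the agent containers, restart the supervisor agent container afterwards so the updated tool definitions are loaded.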

View File

@@ -8,7 +8,7 @@ services:
ports:
- "${AGENTQNA_TGI_SERVICE_PORT-8085}:80"
volumes:
- /var/opea/agent-service/:/data
- ${HF_CACHE_DIR:-/var/opea/agent-service/}:/data
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -1,123 +1,3 @@
# Single node on-prem deployment with Docker Compose on Xeon Scalable processors
This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on Xeon. For LLMs, we use OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md).
## Deployment with docker
1. First, clone this repo.
```
export WORKDIR=<your-work-directory>
cd $WORKDIR
git clone https://github.com/opea-project/GenAIExamples.git
```
2. Set up environment for this example </br>
```
# Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
export host_ip=$(hostname -I | awk '{print $1}')
# if you are in a proxy environment, also set the proxy-related environment variables
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy"
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
# OPENAI_API_KEY if you want to use OpenAI models
export OPENAI_API_KEY=<your-openai-key>
```
3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
First, launch the mega-service.
```
cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
bash launch_retrieval_tool.sh
```
Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
```
bash run_ingest_data.sh
```
4. Prepare SQL database
In this example, we will use the SQLite database provided in the [TAG-Bench](https://github.com/TAG-Research/TAG-Bench/tree/main). Run the commands below.
```
# Download data
cd $WORKDIR
git clone https://github.com/TAG-Research/TAG-Bench.git
cd TAG-Bench/setup
chmod +x get_dbs.sh
./get_dbs.sh
```
5. Launch Tool service
In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
```
docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
```
6. Launch multi-agent system
The configurations of the supervisor agent and the worker agents are defined in the docker compose yaml file. We currently use OpenAI GPT-4o-mini as the LLM.
```
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
bash launch_agent_service_openai.sh
```
7. [Optional] Build `Agent` docker image if pulling images failed.
```
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
```
## Validate services
First look at logs of the agent docker containers:
```
# worker RAG agent
docker logs rag-agent-endpoint
# worker SQL agent
docker logs sql-agent-endpoint
```
```
# supervisor agent
docker logs react-agent-endpoint
```
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
Second, validate worker RAG agent:
```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "Michael Jackson song Thriller"
}'
```
Third, validate worker SQL agent:
```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many employees are in the company?"
}'
```
Finally, validate supervisor agent:
```
curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many albums does Iron Maiden have?"
}'
```
## How to register your own tools with agent
You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
This example showcases a hierarchical multi-agent system for question-answering applications. The Xeon deployment uses OpenAI models via API calls. For instructions, refer to the deployment guide [here](../../../../README.md).

View File

@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent
with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -35,17 +36,17 @@ services:
image: opea/agent:latest
container_name: sql-agent-endpoint
volumes:
- ${WORKDIR}/TAG-Bench/:/home/user/TAG-Bench # SQL database
- ${WORKDIR}/GenAIExamples/AgentQnA/tests:/home/user/chinook-db # SQL database
ports:
- "9096:9096"
ipc: host
environment:
ip_address: ${ip_address}
strategy: sql_agent
with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
hints_file: /home/user/TAG-Bench/${db_name}_hints.csv
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -64,6 +65,7 @@ services:
container_name: react-agent-endpoint
depends_on:
- worker-rag-agent
- worker-sql-agent
volumes:
- ${TOOLSET_PATH}:/home/user/tools/
ports:
@@ -71,14 +73,15 @@ services:
ipc: host
environment:
ip_address: ${ip_address}
strategy: react_langgraph
strategy: react_llama
with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
stream: false
stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}
@@ -89,4 +92,23 @@ services:
LANGCHAIN_PROJECT: "opea-supervisor-agent-service"
CRAG_SERVER: $CRAG_SERVER
WORKER_AGENT_URL: $WORKER_AGENT_URL
SQL_AGENT_URL: $SQL_AGENT_URL
port: 9090
mock-api:
image: docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
container_name: mock-api
ports:
- "8080:8000"
ipc: host
agent-ui:
image: opea/agent-ui
container_name: agent-ui
volumes:
- ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env # test db
ports:
- "5173:5173"
ipc: host
networks:
default:
driver: bridge

View File

@@ -1,22 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export ip_address=$(hostname -I | awk '{print $1}')
export recursion_limit_worker=12
export recursion_limit_supervisor=10
export model="gpt-4o-mini-2024-07-18"
export temperature=0
export max_new_tokens=4096
export OPENAI_API_KEY=${OPENAI_API_KEY}
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
export db_name=california_schools
export db_path="sqlite:////home/user/TAG-Bench/dev_folder/dev_databases/${db_name}/${db_name}.sqlite"
docker compose -f compose_openai.yaml up -d

View File

@@ -0,0 +1,57 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
if [[ -z "${WORKDIR}" ]]; then
echo "Please set WORKDIR environment variable"
exit 0
fi
echo "WORKDIR=${WORKDIR}"
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export ip_address=$(hostname -I | awk '{print $1}')
export recursion_limit_worker=12
export recursion_limit_supervisor=10
export model="gpt-4o-mini-2024-07-18"
export temperature=0
export max_new_tokens=4096
export OPENAI_API_KEY=${OPENAI_API_KEY}
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
export db_name=Chinook
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
if [ ! -f $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite ]; then
echo "Download Chinook_Sqlite!"
wget -O $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite https://github.com/lerocha/chinook-database/releases/download/v1.4.5/Chinook_Sqlite.sqlite
fi
# retriever
export host_ip=$(hostname -I | awk '{print $1}')
export HF_CACHE_DIR=${HF_CACHE_DIR}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export no_proxy=${no_proxy}
export http_proxy=${http_proxy}
export https_proxy=${https_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
export REDIS_URL="redis://${host_ip}:6379"
export INDEX_NAME="rag-redis"
export RERANK_TYPE="tei"
export MEGA_SERVICE_HOST_IP=${host_ip}
export EMBEDDING_SERVICE_HOST_IP=${host_ip}
export RETRIEVER_SERVICE_HOST_IP=${host_ip}
export RERANK_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui"

View File

@@ -1,147 +1,3 @@
# Single node on-prem deployment AgentQnA on Gaudi
This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on Gaudi using open-source LLMs.
For more details, please refer to the deployment guide [here](../../../../README.md).
## Deployment with docker
1. First, clone this repo.
```
export WORKDIR=<your-work-directory>
cd $WORKDIR
git clone https://github.com/opea-project/GenAIExamples.git
```
2. Set up environment for this example </br>
```
# Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
export host_ip=$(hostname -I | awk '{print $1}')
# if you are in a proxy environment, also set the proxy-related environment variables
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy"
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
# for using open-source llms
export HUGGINGFACEHUB_API_TOKEN=<your-HF-token>
# Example export HF_CACHE_DIR=$WORKDIR so that no need to redownload every time
export HF_CACHE_DIR=<directory-where-llms-are-downloaded>
```
3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
First, launch the mega-service.
```
cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
bash launch_retrieval_tool.sh
```
Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
```
bash run_ingest_data.sh
```
4. Prepare SQL database
In this example, we will use the Chinook SQLite database. Run the commands below.
```
# Download data
cd $WORKDIR
git clone https://github.com/lerocha/chinook-database.git
cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
```
5. Launch Tool service
In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
```
docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
```
6. Launch multi-agent system
On Gaudi2 we will serve `meta-llama/Meta-Llama-3.1-70B-Instruct` using vllm.
First build vllm-gaudi docker image.
```bash
cd $WORKDIR
git clone https://github.com/vllm-project/vllm.git
cd ./vllm
git checkout v0.6.6
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
```
Then launch vllm on Gaudi2 with the command below.
```bash
vllm_port=8086
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
```
Then launch Agent microservices.
```bash
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
bash launch_agent_service_gaudi.sh
```
7. [Optional] Build `Agent` docker image if pulling images failed.
If docker image pulling failed in Step 6 above, build the agent docker image with the commands below. After image build, try Step 6 again.
```
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
```
## Validate services
First look at logs of the agent docker containers:
```
# worker RAG agent
docker logs rag-agent-endpoint
# worker SQL agent
docker logs sql-agent-endpoint
```
```
# supervisor agent
docker logs react-agent-endpoint
```
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
Second, validate worker RAG agent:
```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "Michael Jackson song Thriller"
}'
```
Third, validate worker SQL agent:
```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many employees are in the company?"
}'
```
Finally, validate supervisor agent:
```
curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many albums does Iron Maiden have?"
}'
```
## How to register your own tools with agent
You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
This example showcases a hierarchical multi-agent system for question-answering applications. To deploy the example on Gaudi using open-source LLMs, refer to the deployment guide [here](../../../../README.md).

View File

@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
supervisor-react-agent:
environment:
- tools=/home/user/tools/supervisor_agent_webtools.yaml
- GOOGLE_CSE_ID=${GOOGLE_CSE_ID}
- GOOGLE_API_KEY=${GOOGLE_API_KEY}

View File

@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent_llama
with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -43,6 +44,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: sql_agent_llama
with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
@@ -74,6 +76,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: react_llama
with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -81,7 +84,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
stream: false
stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}
@@ -94,3 +97,47 @@ services:
WORKER_AGENT_URL: $WORKER_AGENT_URL
SQL_AGENT_URL: $SQL_AGENT_URL
port: 9090
mock-api:
image: docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
container_name: mock-api
ports:
- "8080:8000"
ipc: host
agent-ui:
image: opea/agent-ui
container_name: agent-ui
volumes:
- ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env
environment:
host_ip: ${host_ip}
ports:
- "5173:5173"
ipc: host
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: vllm-gaudi-server
ports:
- "8086:8000"
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
VLLM_SKIP_WARMUP: true
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8086/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 4 --host 0.0.0.0 --port 8000 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 16384

View File

@@ -1,36 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
WORKPATH=$(dirname "$PWD")/..
# export WORKDIR=$WORKPATH/../../
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# LLM related environment variables
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
export NUM_SHARDS=4
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
export temperature=0
export max_new_tokens=4096
# agent related environment variables
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
echo "TOOLSET_PATH=${TOOLSET_PATH}"
export recursion_limit_worker=12
export recursion_limit_supervisor=10
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
export db_name=Chinook
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
docker compose -f compose.yaml up -d

View File

@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# LLM related environment variables
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
export NUM_SHARDS=4
docker compose -f tgi_gaudi.yaml up -d
sleep 5s
echo "Waiting tgi gaudi ready"
n=0
until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
docker logs tgi-server &> tgi-gaudi-service.log
n=$((n+1))
if grep -q Connected tgi-gaudi-service.log; then
break
fi
sleep 5s
done
sleep 5s
echo "Service started successfully"

View File

@@ -0,0 +1,69 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
WORKPATH=$(dirname "$PWD")/..
# export WORKDIR=$WORKPATH/../../
if [[ -z "${WORKDIR}" ]]; then
echo "Please set WORKDIR environment variable"
exit 0
fi
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
# LLM related environment variables
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct"
export NUM_SHARDS=4
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
export temperature=0
export max_new_tokens=4096
# agent related environment variables
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
echo "TOOLSET_PATH=${TOOLSET_PATH}"
export recursion_limit_worker=12
export recursion_limit_supervisor=10
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
export db_name=Chinook
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
if [ ! -f $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite ]; then
echo "Download Chinook_Sqlite!"
wget -O $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite https://github.com/lerocha/chinook-database/releases/download/v1.4.5/Chinook_Sqlite.sqlite
fi
# configure agent ui
echo "AGENT_URL = 'http://$ip_address:9090/v1/chat/completions'" | tee ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env
# retriever
export host_ip=$(hostname -I | awk '{print $1}')
export no_proxy=${no_proxy}
export http_proxy=${http_proxy}
export https_proxy=${https_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
export REDIS_URL="redis://${host_ip}:6379"
export INDEX_NAME="rag-redis"
export RERANK_TYPE="tei"
export MEGA_SERVICE_HOST_IP=${host_ip}
export EMBEDDING_SERVICE_HOST_IP=${host_ip}
export RETRIEVER_SERVICE_HOST_IP=${host_ip}
export RERANK_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$host_ip"

View File

@@ -20,23 +20,30 @@ function stop_agent_and_api_server() {
function stop_retrieval_tool() {
echo "Stopping Retrieval tool"
docker compose -f $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/docker/docker-compose-retrieval-tool.yaml down
local RETRIEVAL_TOOL_PATH=$WORKPATH/../DocIndexRetriever
cd $RETRIEVAL_TOOL_PATH/docker_compose/intel/cpu/xeon/
container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2)
for container_name in $container_list; do
cid=$(docker ps -aq --filter "name=$container_name")
echo "Stopping container $container_name"
if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
done
}
echo "=================== #1 Building docker images===================="
bash 1_build_images.sh
bash step1_build_images.sh
echo "=================== #1 Building docker images completed===================="
echo "=================== #2 Start retrieval tool===================="
bash 2_start_retrieval_tool.sh
bash step2_start_retrieval_tool.sh
echo "=================== #2 Retrieval tool started===================="
echo "=================== #3 Ingest data and validate retrieval===================="
bash 3_ingest_data_and_validate_retrieval.sh
bash step3_ingest_data_and_validate_retrieval.sh
echo "=================== #3 Data ingestion and validation completed===================="
echo "=================== #4 Start agent and API server===================="
bash 4_launch_and_validate_agent_openai.sh
bash step4_launch_and_validate_agent_openai.sh
echo "=================== #4 Agent test passed ===================="
echo "=================== #5 Stop agent and API server===================="

View File

@@ -22,7 +22,7 @@ function build_docker_images_for_retrieval_tool(){
echo "Build all the images with --no-cache..."
service_list="doc-index-retriever dataprep embedding retriever reranking"
docker compose -f build.yaml build ${service_list} --no-cache
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
docker images && sleep 1s
}
@@ -42,7 +42,8 @@ function build_vllm_docker_image() {
git clone https://github.com/HabanaAI/vllm-fork.git
fi
cd ./vllm-fork
git checkout v0.6.4.post2+Gaudi-1.19.0
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
git checkout ${VLLM_VER} &> /dev/null
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
if [ $? -ne 0 ]; then
echo "opea/vllm-gaudi:ci failed"

View File

@@ -9,7 +9,7 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export host_ip=${ip_address}
export HF_CACHE_DIR=$WORKDIR/hf_cache
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
if [ ! -d "$HF_CACHE_DIR" ]; then
echo "Creating HF_CACHE directory"
mkdir -p "$HF_CACHE_DIR"

View File

@@ -11,9 +11,9 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export HF_CACHE_DIR=/data2/huggingface
export HF_CACHE_DIR=${model_cache:-"/data2/huggingface"}
if [ ! -d "$HF_CACHE_DIR" ]; then
HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"
@@ -27,18 +27,20 @@ vllm_volume=${HF_CACHE_DIR}
function start_tgi(){
echo "Starting tgi-gaudi server"
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
bash launch_tgi_gaudi.sh
source set_env.sh
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml tgi_gaudi.yaml up -d
}
function start_vllm_service_70B() {
function start_all_services() {
echo "token is ${HF_TOKEN}"
echo "start vllm gaudi service"
echo "**************model is $model**************"
vllm_image=opea/vllm-gaudi:ci
docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host $vllm_image --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
source set_env.sh
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
sleep 5s
echo "Waiting vllm gaudi ready"
n=0
@@ -60,23 +62,6 @@ function start_vllm_service_70B() {
echo "Service started successfully"
}
function prepare_data() {
cd $WORKDIR
echo "Downloading data..."
git clone https://github.com/TAG-Research/TAG-Bench.git
cd TAG-Bench/setup
chmod +x get_dbs.sh
./get_dbs.sh
echo "Split data..."
cd $WORKPATH/tests/sql_agent_test
bash run_data_split.sh
echo "Data preparation done!"
}
function download_chinook_data(){
echo "Downloading chinook data..."
cd $WORKDIR
@@ -84,15 +69,6 @@ function download_chinook_data(){
cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
}
function start_agent_and_api_server() {
echo "Starting CRAG server"
docker run -d --runtime=runc --name=kdd-cup-24-crag-service -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
echo "Starting Agent services"
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
bash launch_agent_service_gaudi.sh
sleep 2m
}
function validate() {
local CONTENT="$1"
@@ -112,8 +88,9 @@ function validate_agent_service() {
# # test worker rag agent
echo "======================Testing worker rag agent======================"
export agent_port="9095"
export agent_ip="127.0.0.1"
prompt="Tell me about Michael Jackson song Thriller"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ip_addr $agent_ip --ext_port $agent_port)
# echo $CONTENT
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
echo $EXIT_CODE
@@ -127,7 +104,7 @@ function validate_agent_service() {
echo "======================Testing worker sql agent======================"
export agent_port="9096"
prompt="How many employees are there in the company?"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ip_addr $agent_ip --ext_port $agent_port)
local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
echo $CONTENT
# echo $EXIT_CODE
@@ -140,9 +117,8 @@ function validate_agent_service() {
# test supervisor react agent
echo "======================Testing supervisor react agent======================"
export agent_port="9090"
prompt="How many albums does Iron Maiden have?"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
local EXIT_CODE=$(validate "$CONTENT" "21" "react-agent-endpoint")
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ip_addr $agent_ip --ext_port $agent_port --stream)
local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
# echo $CONTENT
echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
@@ -153,15 +129,6 @@ function validate_agent_service() {
}
function remove_data() {
echo "Removing data..."
cd $WORKDIR
if [ -d "TAG-Bench" ]; then
rm -rf TAG-Bench
fi
echo "Data removed!"
}
function remove_chinook_data(){
echo "Removing chinook data..."
cd $WORKDIR
@@ -171,26 +138,77 @@ function remove_chinook_data(){
echo "Chinook data removed!"
}
export host_ip=$ip_address
echo "ip_address=${ip_address}"
function validate() {
local CONTENT="$1"
local EXPECTED_RESULT="$2"
local SERVICE_NAME="$3"
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected: $CONTENT"
echo 0
else
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
echo 1
fi
}
function ingest_data_and_validate() {
echo "Ingesting data"
cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/
echo $PWD
local CONTENT=$(bash run_ingest_data.sh)
local EXIT_CODE=$(validate "$CONTENT" "Data preparation succeeded" "dataprep-redis-server")
echo "$EXIT_CODE"
local EXIT_CODE="${EXIT_CODE:0-1}"
echo "return value is $EXIT_CODE"
if [ "$EXIT_CODE" == "1" ]; then
docker logs dataprep-redis-server
return 1
fi
}
function validate_retrieval_tool() {
echo "----------------Test retrieval tool ----------------"
local CONTENT=$(http_proxy="" curl http://${ip_address}:8889/v1/retrievaltool -X POST -H "Content-Type: application/json" -d '{
"text": "Who sang Thriller"
}')
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "retrieval-tool")
if [ "$EXIT_CODE" == "1" ]; then
docker logs retrievaltool-xeon-backend-server
exit 1
fi
}
function main() {
echo "==================== Prepare data ===================="
download_chinook_data
echo "==================== Data prepare done ===================="
echo "==================== Start VLLM service ===================="
start_vllm_service_70B
echo "==================== VLLM service started ===================="
echo "==================== Start all services ===================="
start_all_services
echo "==================== all services started ===================="
echo "==================== Start agent ===================="
start_agent_and_api_server
echo "==================== Agent started ===================="
echo "==================== Ingest data ===================="
ingest_data_and_validate
echo "==================== Data ingestion completed ===================="
echo "==================== Validate retrieval tool ===================="
validate_retrieval_tool
echo "==================== Retrieval tool validated ===================="
echo "==================== Validate agent service ===================="
validate_agent_service
echo "==================== Agent service validated ===================="
}
remove_data
remove_chinook_data
main
remove_data
remove_chinook_data

View File

@@ -11,13 +11,22 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
function download_chinook_data(){
echo "Downloading chinook data..."
cd $WORKDIR
git clone https://github.com/lerocha/chinook-database.git
cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
}
function start_agent_and_api_server() {
echo "Starting CRAG server"
docker run -d --runtime=runc --name=kdd-cup-24-crag-service -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
echo "Starting Agent services"
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon/
bash launch_agent_service_openai.sh
sleep 2m
}
function validate() {
@@ -35,19 +44,64 @@ function validate() {
}
function validate_agent_service() {
echo "----------------Test agent ----------------"
local CONTENT=$(http_proxy="" curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"query": "Tell me about Michael Jackson song thriller"
}')
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "react-agent-endpoint")
docker logs react-agent-endpoint
# # test worker rag agent
echo "======================Testing worker rag agent======================"
export agent_port="9095"
prompt="Tell me about Michael Jackson song Thriller"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
# echo $CONTENT
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
if [ "$EXIT_CODE" == "1" ]; then
docker logs rag-agent-endpoint
exit 1
fi
# # test worker sql agent
echo "======================Testing worker sql agent======================"
export agent_port="9096"
prompt="How many employees are there in the company?"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
echo $CONTENT
# echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
if [ "$EXIT_CODE" == "1" ]; then
docker logs sql-agent-endpoint
exit 1
fi
# test supervisor react agent
echo "======================Testing supervisor react agent======================"
export agent_port="9090"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
# echo $CONTENT
echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
if [ "$EXIT_CODE" == "1" ]; then
docker logs react-agent-endpoint
exit 1
fi
}
function remove_chinook_data(){
echo "Removing chinook data..."
cd $WORKDIR
if [ -d "chinook-database" ]; then
rm -rf chinook-database
fi
echo "Chinook data removed!"
}
function main() {
echo "==================== Prepare data ===================="
download_chinook_data
echo "==================== Data prepare done ===================="
echo "==================== Start agent ===================="
start_agent_and_api_server
echo "==================== Agent started ===================="
@@ -57,4 +111,9 @@ function main() {
echo "==================== Agent service validated ===================="
}
remove_chinook_data
main
remove_chinook_data

View File

@@ -11,7 +11,7 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR=$WORKDIR/hf_cache
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
if [ ! -d "$HF_CACHE_DIR" ]; then
mkdir -p "$HF_CACHE_DIR"
fi

View File

@@ -1,34 +1,20 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
import os
import json
import uuid
import requests
def generate_answer_agent_api(url, prompt):
proxies = {"http": ""}
payload = {
"messages": prompt,
}
response = requests.post(url, json=payload, proxies=proxies)
answer = response.json()["text"]
return answer
def process_request(url, query, is_stream=False):
proxies = {"http": ""}
payload = {
"messages": query,
}
content = json.dumps(query) if query is not None else None
try:
resp = requests.post(url=url, json=payload, proxies=proxies, stream=is_stream)
resp = requests.post(url=url, data=content, proxies=proxies, stream=is_stream)
if not is_stream:
ret = resp.json()["text"]
print(ret)
else:
for line in resp.iter_lines(decode_unicode=True):
print(line)
@@ -38,19 +24,54 @@ def process_request(url, query, is_stream=False):
return ret
except requests.exceptions.RequestException as e:
ret = f"An error occurred:{e}"
print(ret)
return False
return None
def test_worker_agent(args):
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
query = {"role": "user", "messages": args.prompt, "stream": "false"}
ret = process_request(url, query)
print("Response: ", ret)
def add_message_and_run(url, user_message, thread_id, stream=False):
print("User message: ", user_message)
query = {"role": "user", "messages": user_message, "thread_id": thread_id, "stream": stream}
ret = process_request(url, query, is_stream=stream)
print("Response: ", ret)
def test_chat_completion_multi_turn(args):
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
thread_id = f"{uuid.uuid4()}"
# first turn
print("===============First turn==================")
user_message = "Which artist has the most albums in the database?"
add_message_and_run(url, user_message, thread_id, stream=args.stream)
print("===============End of first turn==================")
# second turn
print("===============Second turn==================")
user_message = "Give me a few examples of the artist's albums?"
add_message_and_run(url, user_message, thread_id, stream=args.stream)
print("===============End of second turn==================")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--prompt", type=str)
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()
parser.add_argument("--ip_addr", type=str, default="127.0.0.1", help="endpoint ip address")
parser.add_argument("--ext_port", type=str, default="9090", help="endpoint port")
parser.add_argument("--stream", action="store_true", help="streaming mode")
parser.add_argument("--prompt", type=str, help="prompt message")
parser.add_argument("--agent_role", type=str, default="supervisor", help="supervisor or worker")
args, _ = parser.parse_known_args()
ip_address = os.getenv("ip_address", "localhost")
agent_port = os.getenv("agent_port", "9090")
url = f"http://{ip_address}:{agent_port}/v1/chat/completions"
prompt = args.prompt
print(args)
process_request(url, prompt, args.stream)
if args.agent_role == "supervisor":
test_chat_completion_multi_turn(args)
elif args.agent_role == "worker":
test_worker_agent(args)
else:
raise ValueError("Invalid agent role")

View File

@@ -1,7 +1,6 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
WORKPATH=$(dirname "$PWD")
@@ -10,6 +9,22 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$ip_address"
function get_genai_comps() {
if [ ! -d "GenAIComps" ] ; then
git clone --depth 1 --branch ${opea_branch:-"main"} https://github.com/opea-project/GenAIComps.git
fi
}
function build_agent_docker_image() {
cd $WORKDIR/GenAIExamples/AgentQnA/docker_image_build/
get_genai_comps
echo "Build agent image with --no-cache..."
docker compose -f build.yaml build --no-cache
}
function stop_crag() {
cid=$(docker ps -aq --filter "name=kdd-cup-24-crag-service")
@@ -19,12 +34,7 @@ function stop_crag() {
function stop_agent_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi/
container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2)
for container_name in $container_list; do
cid=$(docker ps -aq --filter "name=$container_name")
echo "Stopping container $container_name"
if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
done
docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml down
}
function stop_llm(){
@@ -60,33 +70,21 @@ function stop_retrieval_tool() {
echo "workpath: $WORKPATH"
echo "=================== Stop containers ===================="
stop_crag
stop_llm
stop_agent_docker
stop_retrieval_tool
cd $WORKPATH/tests
echo "=================== #1 Building docker images===================="
bash step1_build_images.sh
build_agent_docker_image
echo "=================== #1 Building docker images completed===================="
echo "=================== #2 Start retrieval tool===================="
bash step2_start_retrieval_tool.sh
echo "=================== #2 Retrieval tool started===================="
echo "=================== #3 Ingest data and validate retrieval===================="
bash step3_ingest_data_and_validate_retrieval.sh
echo "=================== #3 Data ingestion and validation completed===================="
echo "=================== #4 Start agent and API server===================="
bash step4_launch_and_validate_agent_tgi.sh
echo "=================== #4 Agent test passed ===================="
echo "=================== #4 Start agent, API server, retrieval, and ingest data===================="
bash $WORKPATH/tests/step4_launch_and_validate_agent_gaudi.sh
echo "=================== #4 Agent, retrieval test passed ===================="
echo "=================== #5 Stop agent and API server===================="
stop_crag
stop_agent_docker
stop_retrieval_tool
stop_llm
echo "=================== #5 Agent and API server stopped===================="
echo y | docker system prune

View File

@@ -0,0 +1,77 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
search_web_base:
description: Search a web base for a given query. Returns text related to the query.
callable_api: tools.py:search_web_base
args_schema:
query:
type: str
description: query
return_output: retrieved_data
search_knowledge_base:
description: Search a knowledge base for a given query. Returns text related to the query.
callable_api: tools.py:search_knowledge_base
args_schema:
query:
type: str
description: query
return_output: retrieved_data
search_artist_database:
description: Search a SQL database on artists and their music with a natural language query. Returns text related to the query.
callable_api: tools.py:search_sql_database
args_schema:
query:
type: str
description: natural language query
return_output: retrieved_data
get_artist_birth_place:
description: Get the birth place of an artist.
callable_api: tools.py:get_artist_birth_place
args_schema:
artist_name:
type: str
description: artist name
return_output: birth_place
get_billboard_rank_date:
description: Get Billboard ranking for a specific rank and date.
callable_api: tools.py:get_billboard_rank_date
args_schema:
rank:
type: int
description: the rank of interest, for example 1 for top 1
date:
type: str
description: date
return_output: billboard_info
get_song_release_date:
description: Get the release date of a song.
callable_api: tools.py:get_song_release_date
args_schema:
song_name:
type: str
description: song name
return_output: release_date
get_members:
description: Get the member list of a band.
callable_api: tools.py:get_members
args_schema:
band_name:
type: str
description: band name
return_output: members
get_grammy_best_artist_by_year:
description: Get the Grammy Best New Artist for a specific year.
callable_api: tools.py:get_grammy_best_artist_by_year
args_schema:
year:
type: int
description: year
return_output: grammy_best_new_artist

View File

@@ -7,6 +7,24 @@ import requests
from tools.pycragapi import CRAG
def search_web_base(query: str) -> str:
import os
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
search = GoogleSearchAPIWrapper()
tool = Tool(
name="google_search",
description="Search Google for recent results.",
func=search.run,
)
response = tool.run(query)
return response
def search_knowledge_base(query: str) -> str:
"""Search a knowledge base about music and singers for a given query.

View File

@@ -18,13 +18,25 @@ Here're some of the project's features:
2. cd command to the current folder.
```
cd AgentQnA/ui
cd AgentQnA/ui/svelte
```
3. Modify the required .env variables.
3. Modify the required .env variables. The `AGENT_URL` should be in the form of the following:
```
AGENT_URL = ''
AGENT_URL = "http://${ip_address}:${agent_port}/v1/chat/completions"
```
For example, assuming the IP address of the host machine is 10.10.10.1 and the agent port is 9090, then:
```
AGENT_URL = "http://10.10.10.1:9090/v1/chat/completions"
```
You can get the ip address of the host machine by running the command below:
```bash
export ip_address=$(hostname -I | awk '{print $1}')
```
4. **For Local Development:**
@@ -41,7 +53,7 @@ Here're some of the project's features:
npm run dev
```
- The application will be available at `http://localhost:3000`.
- The application will be available at `http://localhost:5173`.
5. **For Docker Setup:**
@@ -54,7 +66,7 @@ Here're some of the project's features:
- Run the Docker container:
```
docker run -d -p 3000:3000 --name agent-ui opea:agent-ui
docker run -d -p 5173:5173 --name agent-ui opea:agent-ui
```
- The application will be available at `http://localhost:3000`.
- The application will be available at `http://${ip_address}:5173`. You can access it with a web browser on your laptop. Note the `ip_address` should be the ip address of the host machine where the UI container runs.

View File

@@ -108,7 +108,7 @@
<!-- svelte-ignore a11y-click-events-have-key-events -->
<div
class="relative rounded-xl bg-white p-2 py-8 pl-16"
on:click={() => handleCreate(feature)}
on:click={() => handleCreate(feature.description)}
>
<dt class="text-base font-semibold text-gray-900">
<div

View File

@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
COPY ./audioqna.py $HOME/audioqna.py

View File

@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
COPY ./audioqna_multilang.py $HOME/audioqna_multilang.py

View File

@@ -16,13 +16,14 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
next_inputs["model"] = LLM_MODEL_ID
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]

View File

@@ -17,6 +17,7 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -24,7 +25,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
next_inputs["model"] = LLM_MODEL_ID
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]

View File

@@ -3,104 +3,317 @@
This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice
pipeline on a server with an AMD ROCm GPU platform.
## 🚀 Build Docker images
## Build Docker Images
### 1. Source Code install GenAIComps
### 1. Build Docker Image
- #### Create application install directory and go to it:
```bash
mkdir ~/audioqna-install && cd ~/audioqna-install
```
- #### Clone the GenAIExamples repository (the default branch "main" is used here):
```bash
git clone https://github.com/opea-project/GenAIExamples.git
```
If you need to use a specific branch/tag of the GenAIExamples repository, replace v1.3 below with the desired value:
```bash
git clone https://github.com/opea-project/GenAIExamples.git && cd GenAIExamples && git checkout v1.3
```
Note that when using a specific version of the code, you need to use the README from that version.
- #### Go to the build directory:
```bash
cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_image_build
```
- Clean up the GenAIComps repository if it was previously cloned into this directory.
This is necessary if a build was performed earlier and the GenAIComps folder exists and is not empty:
```bash
echo Y | rm -R GenAIComps
```
- #### Clone the GenAIComps repository (the default branch "main" is used here):
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
```
### 2. Build ASR Image
Note that when using a specific version of the code, you need to use the README from that version.
```bash
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
- #### Set the list of images for the build (from the build.yaml file)
Depending on whether you want to deploy a vLLM-based or a TGI-based application, set the service list as follows:
#### vLLM-based application
```bash
service_list="vllm-rocm whisper speecht5 audioqna audioqna-ui"
```
#### TGI-based application
```bash
service_list="whisper speecht5 audioqna audioqna-ui"
```
- #### Optional. Pull TGI Docker Image (Do this if you want to use TGI)
```bash
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
```
- #### Build Docker Images
```bash
docker compose -f build.yaml build ${service_list} --no-cache
```
After the build, we check the list of images with the command:
```bash
docker image ls
```
The list of images should include:
##### vLLM-based application:
- opea/vllm-rocm:latest
- opea/whisper:latest
- opea/speecht5:latest
- opea/audioqna:latest
##### TGI-based application:
- ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
- opea/whisper:latest
- opea/speecht5:latest
- opea/audioqna:latest
---
## Deploy the AudioQnA Application
### Docker Compose Configuration for AMD GPUs
To enable GPU support for AMD GPUs, the following configuration is added to the Docker Compose files:
- compose_vllm.yaml - for the vLLM-based application
- compose.yaml - for the TGI-based application
```yaml
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/:/dev/dri/
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
```
### 3. Build LLM Image
This configuration forwards all available GPUs to the container. To use a specific GPU, specify its `cardN` and `renderN` device IDs. For example:
For the ROCm compose example, the AMD-optimized image hosted in the Hugging Face repo will be used for the TGI service: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm (https://github.com/huggingface/text-generation-inference)
### 4. Build TTS Image
```bash
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
```yaml
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/card0:/dev/dri/card0
- /dev/dri/render128:/dev/dri/render128
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
```
### 5. Build MegaService Docker Image
**How to Identify GPU Device IDs:**
Use AMD GPU driver utilities to determine the correct `cardN` and `renderN` IDs for your GPU.
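As a reference, here is a minimal sketch for listing the device IDs on the host (assuming the ROCm driver is installed; `rocm-smi` is optional and only used here to see which GPUs are detected):
```bash
# Each GPU is exposed by the amdgpu driver as a cardN node plus a matching renderD* node.
ls -l /dev/dri/

# Optional: if rocm-smi is available, it lists the detected GPUs.
rocm-smi --showproductname
```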
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:
### Set deploy environment variables
#### Setting variables in the operating system environment:
##### Set variable HUGGINGFACEHUB_API_TOKEN:
```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/AudioQnA/
docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
```
Then run the command `docker images`; you will have the following images ready:
#### Set variables value in set_env\*\*\*\*.sh file:
1. `opea/whisper:latest`
2. `opea/speecht5:latest`
3. `opea/audioqna:latest`
## 🚀 Set the environment variables
Before starting the services with `docker compose`, you have to recheck the following environment variables.
Go to Docker Compose directory:
```bash
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export LLM_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
```
or use the set_env.sh file to set up environment variables.
The example uses the Nano text editor. You can use any convenient text editor:
Note: Please replace host_ip with your external IP address; do not use localhost.
Note: To limit access to a subset of GPUs, pass each device individually using one or more `--device /dev/dri/renderD<node>` options, where `<node>` is the render node index, starting from 128. (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
Example of isolating 1 GPU:
- /dev/dri/card0:/dev/dri/card0
- /dev/dri/renderD128:/dev/dri/renderD128
Example of isolating 2 GPUs:
- /dev/dri/card0:/dev/dri/card0
- /dev/dri/renderD128:/dev/dri/renderD128
- /dev/dri/card1:/dev/dri/card1
- /dev/dri/renderD129:/dev/dri/renderD129
Please find more information about accessing and restricting AMD GPUs in the link (https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html#docker-restrict-gpus)
## 🚀 Start the MegaService
#### If you use vLLM
```bash
cd GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm/
docker compose up -d
nano set_env_vllm.sh
```
In the following cases, you could build the Docker image from source yourself.
#### If you use TGI
- Failed to download the docker image.
- If you want to use a specific version of Docker image.
```bash
nano set_env.sh
```
Please refer to 'Build Docker Images' below.
If you are in a proxy environment, also set the proxy-related environment variables:
## 🚀 Consume the AudioQnA Service
```bash
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
```
Set the values of the variables:
- **HOST_IP, HOST_IP_EXTERNAL** - These variables configure the name/address that the application services use to reach each other and the outside world (an illustrative example follows this list).
If your server has only an internal address and is not accessible from the Internet, both variables get the same value: the server's internal name/address.
If your server has only an external, Internet-accessible address, both variables also get the same value: the server's external name/address.
If your server sits on an internal network with an internal address but is reachable from the Internet through a proxy/firewall/load balancer, set HOST_IP to the server's internal name/address and HOST_IP_EXTERNAL to the external name/address of the proxy/firewall/load balancer in front of it.
We set these values in the file set_env\*\*\*\*.sh.
- **Variables with names ending in `_PORT`** - These variables set the IP port numbers used for network connections to the application services.
The values shown in set_env.sh or set_env_vllm.sh are the ones used during development and testing of the application and are tuned to the development environment. Adjust them according to your server's network access rules, and make sure they do not overlap with IP ports already in use by other applications.
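As an illustration only (the addresses below are placeholders, not values from this repository), the two address variables might look like this in set_env\*\*\*\*.sh:
```bash
# Case 1: server reachable only on an internal network: both values identical.
export HOST_IP="192.168.1.10"
export HOST_IP_EXTERNAL="192.168.1.10"

# Case 2: server behind a proxy/firewall/load balancer:
# internal address for HOST_IP, the proxy's public address for HOST_IP_EXTERNAL.
# export HOST_IP="192.168.1.10"
# export HOST_IP_EXTERNAL="203.0.113.25"
```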
#### Set variables with script set_env\*\*\*\*.sh
#### If you use vLLM
```bash
. set_env_vllm.sh
```
#### If you use TGI
```bash
. set_env.sh
```
### Start the services:
#### If you use vLLM
```bash
docker compose -f compose_vllm.yaml up -d
```
#### If you use TGI
```bash
docker compose -f compose.yaml up -d
```
All containers should be running and should not restart:
##### If you use vLLM:
- audioqna-vllm-service
- whisper-service
- speecht5-service
- audioqna-backend-server
- audioqna-ui-server
##### If you use TGI:
- audioqna-tgi-service
- whisper-service
- speecht5-service
- audioqna-backend-server
- audioqna-ui-server
---
## Validate the Services
### 1. Validate the vLLM/TGI Service
#### If you use vLLM:
```bash
DATA='{"model": "Intel/neural-chat-7b-v3-3t", '\
'"messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'
curl http://${HOST_IP}:${AUDIOQNA_VLLM_SERVICE_PORT}/v1/chat/completions \
-X POST \
-d "$DATA" \
-H 'Content-Type: application/json'
```
Check the response from the service. It should be similar to the following JSON:
```json
{
"id": "chatcmpl-142f34ef35b64a8db3deedd170fed951",
"object": "chat.completion",
"created": 1742270316,
"model": "Intel/neural-chat-7b-v3-3",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "",
"tool_calls": []
},
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": { "prompt_tokens": 66, "total_tokens": 322, "completion_tokens": 256, "prompt_tokens_details": null },
"prompt_logprobs": null
}
```
If the response contains meaningful text in the `choices[0].message.content` field, the vLLM service is considered successfully launched.
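To check this programmatically instead of by eye, a small sketch (assuming `jq` is installed on the host) extracts only the generated text:
```bash
# Reuses the DATA payload defined above; an empty result means the model produced no text.
curl -s http://${HOST_IP}:${AUDIOQNA_VLLM_SERVICE_PORT}/v1/chat/completions \
  -X POST \
  -d "$DATA" \
  -H 'Content-Type: application/json' | jq -r '.choices[0].message.content'
```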
#### If you use TGI:
```bash
DATA='{"inputs":"What is Deep Learning?",'\
'"parameters":{"max_new_tokens":256,"do_sample": true}}'
curl http://${HOST_IP}:${AUDIOQNA_TGI_SERVICE_PORT}/generate \
-X POST \
-d "$DATA" \
-H 'Content-Type: application/json'
```
Check the response from the service. It should be similar to the following JSON:
```json
{
"generated_text": " "
}
```
If the response contains meaningful text in the `generated_text` field, the TGI service is considered successfully launched.
### 2. Validate MegaServices
Test the AudioQnA megaservice by recording a .wav file, encoding the file into the base64 format, and then sending the
base64 string to the megaservice endpoint. The megaservice will return a spoken response as a base64 string. To listen
@@ -114,7 +327,7 @@ curl http://${host_ip}:3008/v1/audioqna \
-H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
```
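To test with your own recording instead of the embedded sample, a short sketch (assuming GNU `base64` and a local `input.wav` file, which is a placeholder name) builds the payload first:
```bash
# Encode the recording as a single-line base64 string, send it, and decode the spoken reply.
B64_AUDIO=$(base64 -w 0 input.wav)
curl http://${host_ip}:3008/v1/audioqna \
  -X POST \
  -d "{\"audio\": \"${B64_AUDIO}\", \"max_tokens\": 64, \"voice\": \"default\"}" \
  -H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
```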
## 🚀 Test MicroServices
### 3. Validate MicroServices
```bash
# whisper service
@@ -123,15 +336,25 @@ curl http://${host_ip}:7066/v1/asr \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
```
### 4. Stop application
#### If you use vLLM
```bash
cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
docker compose -f compose_vllm.yaml down
```
#### If you use TGI
```bash
cd ~/audioqna-install/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
docker compose -f compose.yaml down
```

View File

@@ -69,6 +69,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -0,0 +1,101 @@
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "${WHISPER_SERVER_PORT:-7066}:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
speecht5-service:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- "${SPEECHT5_SERVER_PORT:-7055}:7055"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
audioqna-vllm-service:
image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}
container_name: audioqna-vllm-service
ports:
- "${VLLM_SERVICE_PORT:-8081}:8011"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
WILM_USE_TRITON_FLASH_ATTENTION: 0
PYTORCH_JIT: 0
volumes:
- "${HF_CACHE_DIR:-./data}:/data"
shm_size: 20G
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/:/dev/dri/
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
- apparmor=unconfined
command: "--model ${LLM_MODEL_ID} --swap-space 16 --disable-log-requests --dtype float16 --tensor-parallel-size 1 --host 0.0.0.0 --port 8011 --num-scheduler-steps 1 --distributed-executor-backend \"mp\""
ipc: host
audioqna-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-backend-server
depends_on:
- whisper-service
- audioqna-vllm-service
- speecht5-service
ports:
- "${BACKEND_SERVICE_PORT:-3008}:8888"
environment:
no_proxy: ${no_proxy}
https_proxy: ${https_proxy}
http_proxy: ${http_proxy}
MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
WHISPER_SERVER_HOST_IP: ${WHISPER_SERVER_HOST_IP}
WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
LLM_SERVER_HOST_IP: ${LLM_SERVER_HOST_IP}
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
SPEECHT5_SERVER_HOST_IP: ${SPEECHT5_SERVER_HOST_IP}
SPEECHT5_SERVER_PORT: ${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
audioqna-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-ui-server
depends_on:
- audioqna-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
no_proxy: ${no_proxy}
https_proxy: ${https_proxy}
http_proxy: ${http_proxy}
CHAT_URL: ${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
# export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export host_ip=""
export external_host_ip=""
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR="./data"
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export VLLM_SERVICE_PORT="8081"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export LLM_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=${VLLM_SERVICE_PORT}
export BACKEND_SERVICE_PORT=18038
export FRONTEND_SERVICE_PORT=18039
export BACKEND_SERVICE_ENDPOINT=http://${external_host_ip}:${BACKEND_SERVICE_PORT}/v1/audioqna

View File

@@ -2,6 +2,10 @@
This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [Start the MegaService](#-start-the-megaservice) section on this page.
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
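Optionally, the gated model can be pre-fetched into the local Hugging Face cache once access has been granted; a minimal sketch is shown below (assuming the `huggingface_hub` CLI is installed; the serving container can otherwise download the model itself at startup):
```bash
# Log in with a token that has access to the gated repo, then download the model weights.
pip install -U huggingface_hub
huggingface-cli login --token ${HUGGINGFACEHUB_API_TOKEN}
huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct
```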
## 🚀 Build Docker images
### 1. Source Code install GenAIComps
@@ -17,14 +21,23 @@ cd GenAIComps
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
```
### 3. Build LLM Image
### 3. Build vLLM Image
Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu (https://github.com/huggingface/text-generation-inference)
```bash
git clone https://github.com/vllm-project/vllm.git
cd ./vllm/
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
git checkout ${VLLM_VER}
docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
```
### 4. Build TTS Image
```bash
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
# multilang tts (optional)
docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile .
```
### 5. Build MegaService Docker Image
@@ -40,8 +53,10 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`; you will have the following images ready:
1. `opea/whisper:latest`
2. `opea/speecht5:latest`
3. `opea/audioqna:latest`
2. `opea/vllm:latest`
3. `opea/speecht5:latest`
4. `opea/audioqna:latest`
5. `opea/gpt-sovits:latest` (optional)
## 🚀 Set the environment variables
@@ -51,15 +66,17 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export LLM_SERVER_HOST_IP=${host_ip}
export GPT_SOVITS_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export GPT_SOVITS_SERVER_PORT=9880
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
@@ -67,37 +84,90 @@ export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
or use the set_env.sh file to set up environment variables.
Note: Please replace with host_ip with your external IP address, do not use localhost.
Note:
- Please replace host_ip with your external IP address; do not use localhost.
- If you are in a proxy environment, also set the proxy-related environment variables:
```
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,gpt-sovits-service,tgi-service,vllm-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server
```
## 🚀 Start the MegaService
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
```
If using vLLM as the LLM serving backend:
```
docker compose up -d
# multilang tts (optional)
docker compose -f compose_multilang.yaml up -d
```
If using TGI as the LLM serving backend:
```
docker compose -f compose_tgi.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
1. Whisper Service
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```bash
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@./sample.wav" \
-F model="openai/whisper-small"
```
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
2. LLM backend Service
```
On the first startup, this service takes more time to download, load, and warm up the model. Once that finishes, the service is ready and the container (`vllm-service` or `tgi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
Or try the command below to check whether the LLM serving is ready.
```bash
# vLLM service
docker logs vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs tgi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
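If you prefer to poll the container health status directly instead of grepping logs, here is a small sketch using standard `docker ps` filtering:
```bash
# Wait until the LLM serving container reports (healthy).
# Replace vllm-service with tgi-service if you use the TGI backend.
until docker ps --filter "name=vllm-service" --format '{{.Status}}' | grep -q "(healthy)"; do
  echo "waiting for vllm-service to become healthy..."
  sleep 10
done
```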
Then try the `cURL` command below to validate services.
```bash
# either vLLM or TGI service
curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
-X POST \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
3. TTS Service
```
# speecht5 service
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
# gpt-sovits service (optional)
curl http://${host_ip}:${GPT_SOVITS_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
## 🚀 Test MegaService
@@ -106,7 +176,8 @@ base64 string to the megaservice endpoint. The megaservice will return a spoken
to the response, decode the base64 string and save it as a .wav file.
```bash
# voice can be "default" or "male"
# if you are using speecht5 as the tts service, voice can be "default" or "male"
# if you are using gpt-sovits for the tts service, you can set the reference audio following https://github.com/opea-project/GenAIComps/blob/main/comps/tts/src/integrations/dependency/gpt-sovits/README.md
curl http://${host_ip}:3008/v1/audioqna \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \

View File

@@ -0,0 +1,256 @@
Copyright (C) 2025 Advanced Micro Devices, Inc.
# Deploy AudioQnA application
## 1. Clone repo and build Docker images
### 1.1. Clone the repo
Create an empty directory in your home directory and navigate to it:
```bash
mkdir -p ~/audioqna-test && cd ~/audioqna-test
```
Clone the GenAIExamples repo to build the Docker images:
```bash
git clone https://github.com/opea-project/GenAIExamples.git
```
### 1.2. Navigate to the repo directory and switch to the desired version of the code
If you are using the main branch, no checkout is needed; the main branch is used by default:
```bash
cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git
```
If you are using a specific branch or tag, check out the desired version:
```bash
### Replace "v1.2" with the code version you need (branch or tag)
cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_image_build && git checkout v1.2
git clone https://github.com/opea-project/GenAIComps.git
```
### 1.3. Build Docker images repo
#### Build Docker image:
```bash
service_list="audioqna audioqna-ui whisper speecht5 vllm-rocm"
docker compose -f build.yaml build ${service_list} --no-cache
```
### 1.4. Checking for the necessary Docker images
After building the images, you can check that they are present in the list of available images with the command:
```bash
docker image ls
```
The output of the command should contain images:
- opea/whisper:latest
- opea/speecht5:latest
- opea/vllm-rocm:latest
- opea/audioqna:latest
- opea/audioqna-ui:latest
## 2. Set deploy environment variables
### Setting variables in the operating system environment
#### Set variables:
```bash
### Replace the string 'your_huggingfacehub_token' with your HuggingFacehub repository access token.
export HUGGINGFACEHUB_API_TOKEN='your_huggingfacehub_token'
```
### Setting variables in the file set_env_vllm.sh
```bash
cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm
### The example uses the Nano text editor. You can use any convenient text editor
nano set_env_vllm.sh
```
Set the values of the variables:
- **host_ip, external_host_ip** - These variables configure the name/address that the application services use to reach each other and the outside world.
If your server has only an internal address and is not accessible from the Internet, both variables get the same value: the server's internal name/address.
If your server has only an external, Internet-accessible address, both variables also get the same value: the server's external name/address.
If your server sits on an internal network with an internal address but is reachable from the Internet through a proxy/firewall/load balancer, set host_ip to the server's internal name/address and external_host_ip to the external name/address of the proxy/firewall/load balancer in front of it.
We set these values in the file set_env_vllm.sh
- **Variables with names ending in `_PORT`** - These variables set the IP port numbers used for network connections to the application services.
The values shown in set_env_vllm.sh are the ones used during development and testing of the application and are tuned to the development environment. Adjust them according to your server's network access rules, and make sure they do not overlap with IP ports already in use by other applications (a quick port check is sketched below).
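A quick way to verify that a chosen port is free on the host (assuming the `ss` utility from iproute2 is available; 18038 is just an example value):
```bash
# Repeat for each *_PORT value you plan to use.
ss -tln | grep -w 18038 || echo "port 18038 is free"
```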
If you are in a proxy environment, also set the proxy-related environment variables:
```bash
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
```
## 3. Deploy application
### 3.1. Deploying applications using Docker Compose
```bash
cd ~/audioqna-test/GenAIExamples/AudioQnA/docker_compose/amd/gpu/rocm/
docker compose -f compose_vllm.yaml up -d
```
After starting the containers, check their status with the command:
```bash
docker ps
```
The following containers should be running:
- whisper-service
- speecht5-service
- audioqna-vllm-service
- audioqna-backend-server
- audioqna-ui-server
Containers should not restart.
#### 3.1.1. Configuring GPU forwarding
By default, the Docker Compose file compose_vllm.yaml is configured to forward all GPUs to the audioqna-vllm-service container.
To use specific GPUs, configure the forwarding of the corresponding devices from the host system to the container.
The configuration must be done in:
```yaml
services:
#######
audioqna-vllm-service:
devices:
```
Example of isolating 1 GPU:
```
- /dev/dri/card0:/dev/dri/card0
- /dev/dri/renderD128:/dev/dri/renderD128
```
Example of isolating 2 GPUs:
```
- /dev/dri/card0:/dev/dri/card0
- /dev/dri/renderD128:/dev/dri/renderD128
- /dev/dri/card1:/dev/dri/card1
- /dev/dri/renderD129:/dev/dri/renderD129
```
### 3.2. Checking the application services
#### 3.2.1. Checking audioqna-vllm-service
Verification is performed in two ways:
- Checking the container logs
```bash
docker logs audioqna-vllm-service
```
A message like this should appear in the logs:
```text
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8011 (Press CTRL+C to quit)
```
- Checking the response from the service
```bash
### curl request
### Make sure VLLM_SERVICE_PORT matches the value set in the startup script set_env_vllm.sh (8081 by default)
curl http://${host_ip}:${VLLM_SERVICE_PORT}/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Intel/neural-chat-7b-v3-3",
"prompt": "What is a Deep Learning?",
"max_tokens": 30,
"temperature": 0
}'
```
The response from the service must be in the form of JSON:
```json
{
"id": "cmpl-1d7d175d36d0491cba3abaa8b5bd6991",
"object": "text_completion",
"created": 1740411135,
"model": "Intel/neural-chat-7b-v3-3",
"choices": [
{
"index": 0,
"text": " Deep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is called \"deep\" because it",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null,
"prompt_logprobs": null
}
],
"usage": { "prompt_tokens": 7, "total_tokens": 37, "completion_tokens": 30, "prompt_tokens_details": null }
}
```
The value of "choice.text" must contain a response from the service that makes sense.
If such a response is present, then the search-vllm-service is considered verified.
#### 3.2.2. Checking whisper-service
Checking the response from the service
```bash
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@./sample.wav" \
-F model="openai/whisper-small"
```
The response from the service must be in the form of JSON:
```json
{ "text": "who is pat gelsinger" }
```
If the value of the text key is "who is pat gelsinger", then we consider the service to be successfully launched.
#### 3.2.3. Checking speecht5-service
Checking the response from the service
```bash
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
The result of the request is a speech.mp3 file. If you hear the phrase "Who are you?" while listening to the file, the service is considered successfully launched.
#### 3.2.4. Checking audioqna-backend-server
Checking the response from the service
```bash
curl http://${host_ip}:${BACKEND_SERVICE_PORT}/v1/audioqna \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \
-H 'Content-Type: application/json' | sed 's/^"//;s/"$//' | base64 -d > output.wav
```
The result of the request is the output.wav file. If, when listening to it, you hear a spoken answer introducing the assistant and inviting a new question, the service is considered started.
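If listening to the file is inconvenient (for example, on a headless server), a minimal sanity check with the `file` utility confirms the decoded response is a valid audio container; the test scripts in this repository check for the RIFF signature in the same way:
```bash
# A valid response decodes to RIFF/WAVE audio data.
file output.wav
```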

View File

@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -17,38 +17,41 @@ services:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: vllm-service
ports:
- "3006:80"
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "./data:/data"
shm_size: 1g
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-xeon-backend-server
depends_on:
- whisper-service
- tgi-service
- vllm-service
- speecht5-service
ports:
- "3008:8888"
@@ -61,6 +64,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -18,27 +18,35 @@ services:
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
container_name: gpt-sovits-service
ports:
- "9880:9880"
- ${GPT_SOVITS_SERVER_PORT:-9880}:9880
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: vllm-service
ports:
- "3006:80"
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "./data:/data"
shm_size: 1g
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna-multilang:${TAG:-latest}
container_name: audioqna-xeon-backend-server
@@ -58,7 +66,20 @@ services:
- GPT_SOVITS_SERVER_PORT=${GPT_SOVITS_SERVER_PORT}
ipc: host
restart: always
audioqna-xeon-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-xeon-ui-server
depends_on:
- audioqna-xeon-backend-server
ports:
- "5175:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -0,0 +1,87 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
speecht5-service:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
ports:
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
audioqna-xeon-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-xeon-backend-server
depends_on:
- whisper-service
- tgi-service
- speecht5-service
ports:
- "3008:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
audioqna-xeon-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-xeon-ui-server
depends_on:
- audioqna-xeon-backend-server
ports:
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -8,7 +8,7 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# <token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}

View File

@@ -2,6 +2,10 @@
This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [Start the MegaService](#-start-the-megaservice) section on this page.
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
## 🚀 Build Docker images
### 1. Source Code install GenAIComps
@@ -17,9 +21,13 @@ cd GenAIComps
docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
```
### 3. Build LLM Image
### 3. Build vLLM Image
Intel Gaudi optimized image hosted in the Hugging Face repo will be used for the TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi)
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork/
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
git checkout ${VLLM_VER}
docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .
### 4. Build TTS Image
@@ -40,8 +48,9 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`; you will have the following images ready:
1. `opea/whisper-gaudi:latest`
2. `opea/speecht5-gaudi:latest`
3. `opea/audioqna:latest`
2. `opea/vllm-gaudi:latest`
3. `opea/speecht5-gaudi:latest`
4. `opea/audioqna:latest`
## 🚀 Set the environment variables
@@ -51,7 +60,12 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
# set vLLM parameters
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -65,37 +79,90 @@ export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
```
or use the set_env.sh file to set up environment variables.
Note:
- Please replace host_ip with your external IP address; do not use localhost.
- If you are in a proxy environment, also set the proxy-related environment variables:
```
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,tgi-service,vllm-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server
```
## 🚀 Start the MegaService
> **_NOTE:_** Users will need at least three Gaudi cards for AudioQnA.
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/hpu/gaudi/
```
If using vLLM as the LLM serving backend:
```
docker compose up -d
```
If using TGI as the LLM serving backend:
```
docker compose -f compose_tgi.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
1. Whisper Service
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```bash
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
```
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
2. LLM backend Service
```
On the first startup, this service takes more time to download, load, and warm up the model. Once that finishes, the service is ready and the container (`vllm-gaudi-service` or `tgi-gaudi-service`) status shown via `docker ps` will be `healthy`. Before that, the status will be `health: starting`.
Or try the command below to check whether the LLM serving is ready.
```bash
# vLLM service
docker logs vllm-gaudi-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs tgi-gaudi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
Then try the `cURL` command below to validate services.
```bash
# either vLLM or TGI service
curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
-X POST \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
3. TTS Service
```
# speecht5 service
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
```
## 🚀 Test MegaService

View File

@@ -6,7 +6,7 @@ services:
image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -22,7 +22,7 @@ services:
image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -34,28 +34,27 @@ services:
cap_add:
- SYS_NICE
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: vllm-gaudi-service
ports:
- "3006:80"
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
@@ -63,13 +62,13 @@ services:
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
audioqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-gaudi-backend-server
depends_on:
- whisper-service
- tgi-service
- vllm-service
- speecht5-service
ports:
- "3008:8888"
@@ -82,6 +81,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -0,0 +1,108 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
container_name: whisper-service
ports:
- ${WHISPER_SERVER_PORT:-7066}:7066
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
speecht5-service:
image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
container_name: speecht5-service
ports:
- ${SPEECHT5_SERVER_PORT:-7055}:7055
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-service
ports:
- ${LLM_SERVER_PORT:-3006}:80
volumes:
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
LLM_SERVER_PORT: ${LLM_SERVER_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
audioqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
container_name: audioqna-gaudi-backend-server
depends_on:
- whisper-service
- tgi-service
- speecht5-service
ports:
- "3008:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
audioqna-gaudi-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-gaudi-ui-server
depends_on:
- audioqna-gaudi-backend-server
ports:
- "5173:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -8,7 +8,13 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# <token>
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
# set vLLM parameters
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}

View File

@@ -71,3 +71,24 @@ services:
dockerfile: comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile
extends: audioqna
image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
vllm:
build:
context: vllm
dockerfile: Dockerfile.cpu
extends: audioqna
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
vllm-gaudi:
build:
context: vllm-fork
dockerfile: Dockerfile.hpu
extends: audioqna
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
vllm-rocm:
build:
args:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
context: GenAIComps
dockerfile: comps/third_parties/vllm/src/Dockerfile.amd_gpu
image: ${REGISTRY:-opea}/vllm-rocm:${TAG:-latest}

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -30,18 +31,27 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork/
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi vllm-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -60,8 +70,8 @@ function start_services() {
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
docker logs vllm-gaudi-service > $LOG_PATH/vllm_service_start.log 2>&1
if grep -q complete $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 5s
@@ -85,7 +95,7 @@ function validate_megaservice() {
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-gaudi-server > $LOG_PATH/tgi-gaudi-server.log
docker logs vllm-gaudi-service > $LOG_PATH/vllm-gaudi-service.log
docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
@@ -125,7 +135,7 @@ function validate_megaservice() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose stop && docker compose rm -f
docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -30,18 +31,23 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/vllm-project/vllm.git
cd ./vllm/
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5"
service_list="audioqna audioqna-ui whisper speecht5 vllm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -61,8 +67,8 @@ function start_services() {
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
docker logs vllm-service > $LOG_PATH/vllm_service_start.log 2>&1
if grep -q complete $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 5s
@@ -76,7 +82,7 @@ function validate_megaservice() {
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
docker logs vllm-service > $LOG_PATH/vllm-service.log
docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
@@ -116,7 +122,7 @@ function validate_megaservice() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose stop && docker compose rm -f
docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {

View File

@@ -0,0 +1,146 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
export SPEECHT5_SERVER_HOST_IP=${ip_address}
export LLM_SERVER_HOST_IP=${ip_address}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
export host_ip=${ip_address}
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-gaudi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
n=0
until [[ "$n" -ge 100 ]]; do
docker logs whisper-service > $LOG_PATH/whisper_service_start.log
if grep -q "Uvicorn server setup on port" $LOG_PATH/whisper_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
}
function validate_megaservice() {
response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-gaudi-service > $LOG_PATH/tgi-gaudi-service.log
docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
if [[ $(file speech.mp3) == *"RIFF"* ]]; then
echo "Result correct."
else
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
# cd $WORKPATH/ui/svelte
# local conda_env_name="OPEA_e2e"
# export PATH=${HOME}/miniforge3/bin/:$PATH
## conda remove -n ${conda_env_name} --all -y
## conda create -n ${conda_env_name} python=3.12 -y
# source activate ${conda_env_name}
#
# sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
## conda install -c conda-forge nodejs=22.6.0 -y
# npm install && npm ci && npx playwright install --with-deps
# node -v && npm -v && pip list
#
# exit_status=0
# npx playwright test || exit_status=$?
#
# if [ $exit_status -ne 0 ]; then
# echo "[TEST INFO]: ---------frontend test failed---------"
# exit $exit_status
# else
# echo "[TEST INFO]: ---------frontend test passed---------"
# fi
#}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_megaservice
# validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -0,0 +1,137 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
export MEGA_SERVICE_HOST_IP=${ip_address}
export WHISPER_SERVER_HOST_IP=${ip_address}
export SPEECHT5_SERVER_HOST_IP=${ip_address}
export LLM_SERVER_HOST_IP=${ip_address}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
export host_ip=${ip_address}
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
}
function validate_megaservice() {
response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
if [[ $(file speech.mp3) == *"RIFF"* ]]; then
echo "Result correct."
else
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
# cd $WORKPATH/ui/svelte
# local conda_env_name="OPEA_e2e"
# export PATH=${HOME}/miniforge3/bin/:$PATH
## conda remove -n ${conda_env_name} --all -y
## conda create -n ${conda_env_name} python=3.12 -y
# source activate ${conda_env_name}
#
# sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
## conda install -c conda-forge nodejs=22.6.0 -y
# npm install && npm ci && npx playwright install --with-deps
# node -v && npm -v && pip list
#
# exit_status=0
# npx playwright test || exit_status=$?
#
# if [ $exit_status -ne 0 ]; then
# echo "[TEST INFO]: ---------frontend test failed---------"
# exit $exit_status
# else
# echo "[TEST INFO]: ---------frontend test passed---------"
# fi
#}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_megaservice
# validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -0,0 +1,141 @@
#!/bin/bash
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
export PATH="$HOME/miniconda3/bin:$PATH"
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="audioqna audioqna-ui whisper speecht5 vllm-rocm"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker images && sleep 3s
}
function start_services() {
cd $WORKPATH/docker_compose/amd/gpu/rocm/
export host_ip=${ip_address}
export external_host_ip=${ip_address}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR="./data"
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export VLLM_SERVICE_PORT="8081"
export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export LLM_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export LLM_SERVER_PORT=${VLLM_SERVICE_PORT}
export BACKEND_SERVICE_PORT=3008
export FRONTEND_SERVICE_PORT=5173
export BACKEND_SERVICE_ENDPOINT=http://${external_host_ip}:${BACKEND_SERVICE_PORT}/v1/audioqna
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_vllm.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 200 ]]; do
docker logs audioqna-vllm-service >& $LOG_PATH/vllm_service_start.log
if grep -q "Application startup complete" $LOG_PATH/vllm_service_start.log; then
break
fi
sleep 10s
n=$((n+1))
done
}
function validate_megaservice() {
response=$(http_proxy="" curl http://${ip_address}:${BACKEND_SERVICE_PORT}/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
# always print the log
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs speecht5-service > $LOG_PATH/tts-service.log
docker logs audioqna-vllm-service > $LOG_PATH/audioqna-vllm-service.log
docker logs audioqna-backend-server > $LOG_PATH/audioqna-backend-server.log
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
if [[ $(file speech.mp3) == *"RIFF"* ]]; then
echo "Result correct."
else
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
## Frontend tests are currently disabled
# cd $WORKPATH/ui/svelte
# local conda_env_name="OPEA_e2e"
# export PATH=${HOME}/miniforge3/bin/:$PATH
## conda remove -n ${conda_env_name} --all -y
## conda create -n ${conda_env_name} python=3.12 -y
# source activate ${conda_env_name}
#
# sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
## conda install -c conda-forge nodejs -y
# npm install && npm ci && npx playwright install --with-deps
# node -v && npm -v && pip list
#
# exit_status=0
# npx playwright test || exit_status=$?
#
# if [ $exit_status -ne 0 ]; then
# echo "[TEST INFO]: ---------frontend test failed---------"
# exit $exit_status
# else
# echo "[TEST INFO]: ---------frontend test passed---------"
# fi
#}
function stop_docker() {
cd $WORKPATH/docker_compose/amd/gpu/rocm/
docker compose -f compose_vllm.yaml stop && docker compose -f compose_vllm.yaml rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_megaservice
# Frontend tests are currently disabled
# validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -32,7 +32,7 @@ COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip && \
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME

View File

@@ -0,0 +1,209 @@
# Build Mega Service of AvatarChatbot on AMD GPU
This document outlines the deployment process for an AvatarChatbot application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server.
## 🚀 Build Docker images
### 1. Install GenAIComps from Source
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
```
### 2. Build ASR Image
```bash
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/Dockerfile .
```
### 3. Build LLM Image
```bash
docker build --no-cache -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
```
### 4. Build TTS Image
```bash
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/Dockerfile .
```
### 5. Build Animation Image
```bash
docker build -t opea/wav2lip:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/wav2lip/src/Dockerfile .
docker build -t opea/animation:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/animation/src/Dockerfile .
```
### 6. Build MegaService Docker Image
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:
```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/AvatarChatbot/
docker build --no-cache -t opea/avatarchatbot:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
```
Then run the command `docker images`; you should have the following images ready (a quick verification command is shown after the list):
1. `opea/whisper:latest`
2. `opea/asr:latest`
3. `opea/llm-textgen:latest`
4. `opea/speecht5:latest`
5. `opea/tts:latest`
6. `opea/wav2lip:latest`
7. `opea/animation:latest`
8. `opea/avatarchatbot:latest`
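As an optional sanity check (not part of the upstream instructions), you can list all locally available OPEA images in one go:
```bash
# list every image in the opea/ namespace that was built or pulled above
docker images | grep "opea/"
```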
## 🚀 Set the environment variables
Before starting the services with `docker compose`, make sure the following environment variables are set correctly.
```bash
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export host_ip=$(hostname -I | awk '{print $1}')
export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}
export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008
export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None'
export AUDIO='None'
export FACESIZE=96
export OUTFILE="/outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=10
```
Warning! In this solution the Wav2Lip service runs on the CPU only. To use AMD GPUs and achieve operational performance, the Wav2Lip image needs to be adapted to AMD hardware and the ROCm framework.
## 🚀 Start the MegaService
```bash
cd GenAIExamples/AvatarChatbot/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
# asr microservice
curl http://${host_ip}:3001/v1/audio/transcriptions \
-X POST \
-d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
# llm microservice
curl http://${host_ip}:3007/v1/chat/completions\
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
-H 'Content-Type: application/json'
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
# tts microservice
curl http://${host_ip}:3002/v1/audio/speech \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
# wav2lip service
cd ../../../..
curl http://${host_ip}:7860/v1/wav2lip \
-X POST \
-d @assets/audio/sample_minecraft.json \
-H 'Content-Type: application/json'
# animation microservice
curl http://${host_ip}:3008/v1/animation \
-X POST \
-d @assets/audio/sample_question.json \
-H "Content-Type: application/json"
```
## 🚀 Test MegaService
```bash
curl http://${host_ip}:3009/v1/avatarchatbot \
-X POST \
-d @assets/audio/sample_whoareyou.json \
-H 'Content-Type: application/json'
```
If the megaservice is running properly, you should see the following output:
```bash
"/outputs/result.mp4"
```
The output file will be saved in the current working directory, as `${PWD}` is mapped to `/outputs` inside the wav2lip-service Docker container.
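To verify that the animation step actually produced a playable file, you can inspect it from the host (a minimal sanity check; the file name assumes the default `OUTFILE` shown above):
```bash
# result.mp4 should appear in the directory from which docker compose was started
ls -lh result.mp4
file result.mp4   # expect something like "ISO Media, MP4 ..."
```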
## Gradio UI
```bash
cd $WORKPATH/GenAIExamples/AvatarChatbot
python3 ui/gradio/app_gradio_demo_avatarchatbot.py
```
The UI can be viewed at http://${host_ip}:7861
<img src="../../../../assets/img/UI.png" alt="UI Example" width="60%">
In the current version (v1.0), you need to set the avatar figure image/video and the DL model choice via environment variables before starting the AvatarChatbot backend service and running the UI. Only the audio question can be customized in the UI.
\*\* Changing the avatar figure between runs will be enabled in v2.0.
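In the meantime, one possible workaround (an untested sketch, not an official feature; service names are assumed from the compose file in this PR) is to re-export the avatar-related variables and recreate the affected containers so the new values take effect:
```bash
# hypothetical alternative avatar image; adjust the path to your own asset
export FACE="assets/img/avatar1.png"
docker compose -f compose.yaml up -d --force-recreate wav2lip-service animation
```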
## Troubleshooting
```bash
cd GenAIExamples/AvatarChatbot/tests
export IMAGE_REPO="opea"
export IMAGE_TAG="latest"
export HUGGINGFACEHUB_API_TOKEN=<your_hf_token>
bash test_avatarchatbot_on_xeon.sh
```

View File

@@ -0,0 +1,158 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
asr:
image: ${REGISTRY:-opea}/asr:${TAG:-latest}
container_name: asr-service
ports:
- "3001:9099"
ipc: host
environment:
ASR_ENDPOINT: ${ASR_ENDPOINT}
speecht5-service:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tts:
image: ${REGISTRY:-opea}/tts:${TAG:-latest}
container_name: tts-service
ports:
- "3002:9088"
ipc: host
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
container_name: tgi-service
ports:
- "${TGI_SERVICE_PORT:-3006}:80"
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/:/dev/dri/
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: llm-tgi-server
depends_on:
- tgi-service
ports:
- "3007:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
OPENAI_API_KEY: ${OPENAI_API_KEY}
restart: unless-stopped
wav2lip-service:
image: ${REGISTRY:-opea}/wav2lip:${TAG:-latest}
container_name: wav2lip-service
ports:
- "7860:7860"
ipc: host
volumes:
- ${PWD}:/outputs
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
DEVICE: ${DEVICE}
INFERENCE_MODE: ${INFERENCE_MODE}
CHECKPOINT_PATH: ${CHECKPOINT_PATH}
FACE: ${FACE}
AUDIO: ${AUDIO}
FACESIZE: ${FACESIZE}
OUTFILE: ${OUTFILE}
GFPGAN_MODEL_VERSION: ${GFPGAN_MODEL_VERSION}
UPSCALE_FACTOR: ${UPSCALE_FACTOR}
FPS: ${FPS}
WAV2LIP_PORT: ${WAV2LIP_PORT}
restart: unless-stopped
animation:
image: ${REGISTRY:-opea}/animation:${TAG:-latest}
container_name: animation-server
ports:
- "3008:9066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
WAV2LIP_ENDPOINT: ${WAV2LIP_ENDPOINT}
restart: unless-stopped
avatarchatbot-backend-server:
image: ${REGISTRY:-opea}/avatarchatbot:${TAG:-latest}
container_name: avatarchatbot-backend-server
depends_on:
- asr
- llm
- tts
- animation
ports:
- "3009:8888"
environment:
no_proxy: ${no_proxy}
https_proxy: ${https_proxy}
http_proxy: ${http_proxy}
MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT}
ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
ASR_SERVICE_PORT: ${ASR_SERVICE_PORT}
LLM_SERVICE_HOST_IP: ${LLM_SERVICE_HOST_IP}
LLM_SERVICE_PORT: ${LLM_SERVICE_PORT}
LLM_SERVER_HOST_IP: ${LLM_SERVICE_HOST_IP}
LLM_SERVER_PORT: ${LLM_SERVICE_PORT}
TTS_SERVICE_HOST_IP: ${TTS_SERVICE_HOST_IP}
TTS_SERVICE_PORT: ${TTS_SERVICE_PORT}
ANIMATION_SERVICE_HOST_IP: ${ANIMATION_SERVICE_HOST_IP}
ANIMATION_SERVICE_PORT: ${ANIMATION_SERVICE_PORT}
WHISPER_SERVER_HOST_IP: ${WHISPER_SERVER_HOST_IP}
WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
SPEECHT5_SERVER_HOST_IP: ${SPEECHT5_SERVER_HOST_IP}
SPEECHT5_SERVER_PORT: ${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export OPENAI_API_KEY=${OPENAI_API_KEY}
export host_ip=$(hostname -I | awk '{print $1}')
export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
export WHISPER_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_PORT=7055
export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}
export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008
export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="/home/user/comps/animation/src/assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None'
export AUDIO='None'
export FACESIZE=96
export OUTFILE="/outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=10

View File

@@ -0,0 +1,170 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
if ls $LOG_PATH/*.log 1> /dev/null 2>&1; then
rm $LOG_PATH/*.log
echo "Log files removed."
else
echo "No log files to remove."
fi
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="avatarchatbot whisper asr llm-textgen speecht5 tts wav2lip animation"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker images && sleep 3s
}
function start_services() {
cd $WORKPATH/docker_compose/amd/gpu/rocm
export HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN
export OPENAI_API_KEY=$OPENAI_API_KEY
export host_ip=${ip_address}
export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_PORT=7055
export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008
export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="/home/user/comps/animation/src/assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None'
export AUDIO='None'
export FACESIZE=96
export OUTFILE="./outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=5
# Start Docker Containers
docker compose up -d --force-recreate
echo "Check tgi-service status"
n=0
until [[ "$n" -ge 100 ]]; do
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
echo "tgi-service are up and running"
sleep 5s
echo "Check wav2lip-service status"
n=0
until [[ "$n" -ge 100 ]]; do
docker logs wav2lip-service >& $LOG_PATH/wav2lip-service_start.log
if grep -q "Application startup complete" $LOG_PATH/wav2lip-service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
echo "wav2lip-service are up and running"
sleep 5s
}
function validate_megaservice() {
cd $WORKPATH
ls
result=$(http_proxy="" curl http://${ip_address}:3009/v1/avatarchatbot -X POST -d @assets/audio/sample_whoareyou.json -H 'Content-Type: application/json')
echo "result is === $result"
if [[ $result == *"mp4"* ]]; then
echo "Result correct."
else
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs asr-service > $LOG_PATH/asr-service.log
docker logs speecht5-service > $LOG_PATH/speecht5-service.log
docker logs tts-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
docker logs llm-tgi-server > $LOG_PATH/llm-tgi-server.log
docker logs wav2lip-service > $LOG_PATH/wav2lip-service.log
docker logs animation-server > $LOG_PATH/animation-server.log
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
#}
function stop_docker() {
cd $WORKPATH/docker_compose/amd/gpu/rocm
docker compose down && docker compose rm -f
}
function main() {
echo $OPENAI_API_KEY
echo $OPENAI_KEY
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
# validate_microservices
sleep 30
validate_megaservice
# validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -1,49 +1,10 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
COPY ./chatqna.py $HOME/chatqna.py
COPY ./entrypoint.sh $HOME/entrypoint.sh
ENTRYPOINT ["python", "chatqna.py"]
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,49 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./chatqna.py $HOME/chatqna.py
ENTRYPOINT ["python", "chatqna.py", "--with-guardrails"]

View File

@@ -1,49 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
COPY ./chatqna.py $HOME/chatqna.py
ENTRYPOINT ["python", "chatqna.py", "--without-rerank"]

View File

@@ -68,13 +68,16 @@ To set up environment variables for deploying ChatQnA services, follow these ste
```bash
# on Gaudi
source ./docker_compose/intel/hpu/gaudi/set_env.sh
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-node-exporter-1
# on Xeon
source ./docker_compose/intel/cpu/xeon/set_env.sh
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,xeon-node-exporter-1
# on Nvidia GPU
source ./docker_compose/nvidia/gpu/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-ui-server,chatqna-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service
```
@@ -91,6 +94,14 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
CPU example with the OpenTelemetry feature:
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
It will automatically download the Docker images from Docker Hub:
```bash
@@ -232,6 +243,13 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.
### Deploy ChatQnA on Xeon
@@ -243,6 +261,13 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.
### Deploy ChatQnA on NVIDIA GPU
@@ -346,7 +371,7 @@ OPEA microservice deployment can easily be monitored through Grafana dashboards
## Tracing Services with OpenTelemetry Tracing and Jaeger
> NOTE: limited support. Only LLM inference serving with TGI on Gaudi is enabled for this feature.
> NOTE: This feature is disabled by default. Please check the Deploy ChatQnA sections for how to enable this feature with the compose.telemetry.yaml file.
OPEA microservices and TGI/TEI serving can easily be traced through Jaeger dashboards in conjunction with the OpenTelemetry Tracing feature. Follow the [README](https://github.com/opea-project/GenAIComps/tree/main/comps/cores/telemetry#tracing) to trace additional functions if needed.
@@ -357,8 +382,17 @@ Users can also get the external IP via the command below.
ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+'
```
Access the Jaeger dashboard UI at http://{EXTERNAL_IP}:16686
For TGI serving on Gaudi, users can see different services such as opea, TEI and TGI.
![Screenshot from 2024-12-27 11-58-18](https://github.com/user-attachments/assets/6126fa70-e830-4780-bd3f-83cb6eff064e)
Here is a screenshot for one tracing of TGI serving request.
![Screenshot from 2024-12-27 11-26-25](https://github.com/user-attachments/assets/3a7c51c6-f422-41eb-8e82-c3df52cd48b8)
There are also OPEA-related tracings. Users can understand the time breakdown of each service request by looking into each `opea:schedule` operation.
![image](https://github.com/user-attachments/assets/6137068b-b374-4ff8-b345-993343c0c25f)
There can also be async functions such as `llm/MicroService_asyn_generate`; the user needs to check the trace of the async function in another operation such as `opea:llm_generate_stream`.
![image](https://github.com/user-attachments/assets/a973d283-198f-4ce2-a7eb-58515b77503e)

View File

@@ -0,0 +1,112 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
deploy:
device: gaudi
version: 1.2.0
modelUseHostPath: /mnt/models
HUGGINGFACEHUB_API_TOKEN: "" # mandatory
node: [1, 2, 4, 8]
namespace: ""
timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
interval: 5 # interval in seconds between service ready checks, default 5 seconds
services:
backend:
resources:
enabled: False
cores_per_instance: "16"
memory_capacity: "8000Mi"
replicaCount: [1, 2, 4, 8]
teirerank:
enabled: True
model_id: ""
resources:
enabled: False
cards_per_instance: 1
replicaCount: [1, 1, 1, 1]
tei:
model_id: ""
resources:
enabled: False
cores_per_instance: "80"
memory_capacity: "20000Mi"
replicaCount: [1, 2, 4, 8]
llm:
engine: vllm # or tgi
model_id: "meta-llama/Meta-Llama-3-8B-Instruct" # mandatory
replicaCount:
with_teirerank: [7, 15, 31, 63] # When teirerank.enabled is True
without_teirerank: [8, 16, 32, 64] # When teirerank.enabled is False
resources:
enabled: False
cards_per_instance: 1
model_params:
vllm: # VLLM specific parameters
batch_params:
enabled: True
max_num_seqs: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
token_params:
enabled: False
max_input_length: ""
max_total_tokens: ""
max_batch_total_tokens: ""
max_batch_prefill_tokens: ""
tgi: # TGI specific parameters
batch_params:
enabled: True
max_batch_size: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
token_params:
enabled: False
max_input_length: "1280"
max_total_tokens: "2048"
max_batch_total_tokens: "65536"
max_batch_prefill_tokens: "4096"
data-prep:
resources:
enabled: False
cores_per_instance: ""
memory_capacity: ""
replicaCount: [1, 1, 1, 1]
retriever-usvc:
resources:
enabled: False
cores_per_instance: "8"
memory_capacity: "8000Mi"
replicaCount: [1, 2, 4, 8]
redis-vector-db:
resources:
enabled: False
cores_per_instance: ""
memory_capacity: ""
replicaCount: [1, 1, 1, 1]
chatqna-ui:
replicaCount: [1, 1, 1, 1]
nginx:
replicaCount: [1, 1, 1, 1]
benchmark:
# http request behavior related fields
user_queries: [640]
concurrency: [128]
load_shape_type: "constant" # "constant" or "poisson"
poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
warmup_iterations: 10
seed: 1024
# workload, all of the test cases will run for benchmark
bench_target: [chatqnafixed, chatqna_qlist_pubmed] # specify the bench_target for benchmark
dataset: ["/home/sdp/upload_file.txt", "/home/sdp/pubmed_10000.txt"] # specify the absolute path to the dataset file
prompt: [10, 1000] # set the prompt length for the chatqna_qlist_pubmed workload, set to 10 for chatqnafixed workload
llm:
# specify the llm output token size
max_token_size: [128, 256]

View File

@@ -159,7 +159,10 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
next_data["inputs"] = prompt
elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["stream"]:
next_data["text"] = data["choices"][0]["message"]["content"]
if "faqgen" in self.services[cur_node].endpoint:
next_data = data
else:
next_data["text"] = data["choices"][0]["message"]["content"]
else:
next_data = data
@@ -167,7 +170,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
def align_generator(self, gen, **kwargs):
# openai reaponse format
# OpenAI response format
# b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
for line in gen:
line = line.decode("utf-8")
@@ -178,7 +181,12 @@ def align_generator(self, gen, **kwargs):
try:
# sometimes yield empty chunk, do a fallback here
json_data = json.loads(json_str)
if (
if "ops" in json_data and "op" in json_data["ops"][0]:
if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
else:
pass
elif (
json_data["choices"][0]["finish_reason"] != "eos_token"
and "content" in json_data["choices"][0]["delta"]
):
@@ -329,6 +337,48 @@ class ChatQnAService:
self.megaservice.flow_to(rerank, llm)
# self.megaservice.flow_to(llm, guardrail_out)
def add_remote_service_faqgen(self):
embedding = MicroService(
name="embedding",
host=EMBEDDING_SERVER_HOST_IP,
port=EMBEDDING_SERVER_PORT,
endpoint="/embed",
use_remote_service=True,
service_type=ServiceType.EMBEDDING,
)
retriever = MicroService(
name="retriever",
host=RETRIEVER_SERVICE_HOST_IP,
port=RETRIEVER_SERVICE_PORT,
endpoint="/v1/retrieval",
use_remote_service=True,
service_type=ServiceType.RETRIEVER,
)
rerank = MicroService(
name="rerank",
host=RERANK_SERVER_HOST_IP,
port=RERANK_SERVER_PORT,
endpoint="/rerank",
use_remote_service=True,
service_type=ServiceType.RERANK,
)
llm = MicroService(
name="llm",
host=LLM_SERVER_HOST_IP,
port=LLM_SERVER_PORT,
endpoint="/v1/faqgen",
use_remote_service=True,
service_type=ServiceType.LLM,
)
self.megaservice.add(embedding).add(retriever).add(rerank).add(llm)
self.megaservice.flow_to(embedding, retriever)
self.megaservice.flow_to(retriever, rerank)
self.megaservice.flow_to(rerank, llm)
async def handle_request(self, request: Request):
data = await request.json()
stream_opt = data.get("stream", True)
@@ -344,6 +394,7 @@ class ChatQnAService:
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
stream=stream_opt,
chat_template=chat_request.chat_template if chat_request.chat_template else None,
model=chat_request.model if chat_request.model else None,
)
retriever_parameters = RetrieverParms(
search_type=chat_request.search_type if chat_request.search_type else "similarity",
@@ -399,6 +450,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--without-rerank", action="store_true")
parser.add_argument("--with-guardrails", action="store_true")
parser.add_argument("--faqgen", action="store_true")
args = parser.parse_args()
@@ -407,6 +459,8 @@ if __name__ == "__main__":
chatqna.add_remote_service_without_rerank()
elif args.with_guardrails:
chatqna.add_remote_service_with_guardrails()
elif args.faqgen:
chatqna.add_remote_service_faqgen()
else:
chatqna.add_remote_service()

View File

@@ -1,4 +1,4 @@
# Build and deploy CodeGen Application on AMD GPU (ROCm)
# Build and deploy ChatQnA Application on AMD GPU (ROCm)
## Build MegaService of ChatQnA on AMD ROCm GPU
@@ -105,7 +105,15 @@ docker build --no-cache -t opea/retriever:latest --build-arg https_proxy=$https_
docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
```
### 4. Build MegaService Docker Image
### 4. Build FaqGen LLM Image (Optional)
If you want to enable the FAQ generation LLM in the pipeline, use the command below:
```bash
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
```
### 5. Build MegaService Docker Image
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image using the command below:
@@ -116,7 +124,7 @@ docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_pr
cd ../../..
```
### 5. Build UI Docker Image
### 6. Build UI Docker Image
Construct the frontend Docker image using the command below:
@@ -126,7 +134,7 @@ docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https
cd ../../../..
```
### 6. Build React UI Docker Image (Optional)
### 7. Build React UI Docker Image (Optional)
Construct the frontend Docker image using the command below:
@@ -136,7 +144,7 @@ docker build --no-cache -t opea/chatqna-react-ui:latest --build-arg https_proxy=
cd ../../../..
```
### 7. Build Nginx Docker Image
### 8. Build Nginx Docker Image
```bash
cd GenAIComps
@@ -151,6 +159,10 @@ Then run the command `docker images`, you will have the following 5 Docker Image
4. `opea/chatqna-ui:latest` or `opea/chatqna-react-ui:latest`
5. `opea/nginx:latest`
If the FaqGen Docker image is built, you will find one more image:
- `opea/llm-faqgen:latest`
## 🚀 Start MicroServices and MegaService
### Required Models
@@ -190,6 +202,7 @@ Change the `xxx_MODEL_ID` below for your needs.
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8001
export CHATQNA_REDIS_DATAPREP_PORT=6007
export CHATQNA_REDIS_RETRIEVER_PORT=7000
export CHATQNA_LLM_FAQGEN_PORT=9000
export CHATQNA_INDEX_NAME="rag-redis"
export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
@@ -246,7 +259,10 @@ Please find more information about accessing and restricting AMD GPUs in the lin
```bash
cd GenAIExamples/ChatQnA/docker_compose/amd/gpu/rocm
## for text generation
docker compose up -d
## for FAQ generation
docker compose -f compose_faqgen.yaml up -d
```
### Validate MicroServices and MegaService
@@ -310,7 +326,16 @@ docker compose up -d
-H 'Content-Type: application/json'
```
5. MegaService
5. FaqGen LLM Microservice (if enabled)
```bash
curl http://${host_ip}:${CHATQNA_LLM_FAQGEN_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
```
6. MegaService
```bash
curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
@@ -318,7 +343,7 @@ docker compose up -d
}'
```
6. Nginx Service
7. Nginx Service
```bash
curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
@@ -326,7 +351,7 @@ docker compose up -d
-d '{"messages": "What is the revenue of Nike in 2023?"}'
```
7. Dataprep Microservice (Optional)
8. Dataprep Microservice (Optional)
If you want to update the default knowledge base, you can use the following commands:

View File

@@ -30,7 +30,7 @@ services:
ports:
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
volumes:
- "/var/opea/chatqna-service/data:/data"
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
ipc: host
environment:
@@ -72,7 +72,7 @@ services:
ports:
- "${CHATQNA_TEI_RERANKING_PORT}:80"
volumes:
- "/var/opea/chatqna-service/data:/data"
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -104,7 +104,7 @@ services:
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
volumes:
- "/var/opea/chatqna-service/data:/data"
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd

View File

@@ -0,0 +1,205 @@
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
services:
chatqna-redis-vector-db:
image: redis/redis-stack:7.2.0-v9
container_name: redis-vector-db
ports:
- "${CHATQNA_REDIS_VECTOR_PORT}:6379"
- "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT}:8001"
chatqna-dataprep-redis-service:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-redis-server
depends_on:
- chatqna-redis-vector-db
- chatqna-tei-embedding-service
ports:
- "${CHATQNA_REDIS_DATAPREP_PORT}:5000"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
REDIS_URL: ${CHATQNA_REDIS_URL}
INDEX_NAME: ${CHATQNA_INDEX_NAME}
TEI_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
chatqna-tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: chatqna-tei-embedding-server
ports:
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
volumes:
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
command: --model-id ${CHATQNA_EMBEDDING_MODEL_ID} --auto-truncate
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/card1:/dev/dri/card1
- /dev/dri/renderD136:/dev/dri/renderD136
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
chatqna-retriever:
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
container_name: chatqna-retriever-redis-server
depends_on:
- chatqna-redis-vector-db
ports:
- "${CHATQNA_REDIS_RETRIEVER_PORT}:7000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
REDIS_URL: ${CHATQNA_REDIS_URL}
INDEX_NAME: ${CHATQNA_INDEX_NAME}
TEI_EMBEDDING_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
LOGFLAG: ${LOGFLAG}
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
restart: unless-stopped
chatqna-tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: chatqna-tei-reranking-server
ports:
- "${CHATQNA_TEI_RERANKING_PORT}:80"
volumes:
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/:/dev/dri/
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
command: --model-id ${CHATQNA_RERANK_MODEL_ID} --auto-truncate
chatqna-tgi-service:
image: ${CHATQNA_TGI_SERVICE_IMAGE}
container_name: chatqna-tgi-server
ports:
- "${CHATQNA_TGI_SERVICE_PORT}:80"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
volumes:
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/:/dev/dri/
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
command: --model-id ${CHATQNA_LLM_MODEL_ID}
ipc: host
chatqna-llm-faqgen:
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
container_name: llm-faqgen-server
depends_on:
- chatqna-tgi-service
ports:
- ${CHATQNA_LLM_FAQGEN_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HF_TOKEN: ${HF_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenTgi}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
chatqna-backend-server:
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
container_name: chatqna-backend-server
depends_on:
- chatqna-redis-vector-db
- chatqna-tei-embedding-service
- chatqna-retriever
- chatqna-tei-reranking-service
- chatqna-tgi-service
- chatqna-llm-faqgen
ports:
- "${CHATQNA_BACKEND_SERVICE_PORT}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${CHATQNA_MEGA_SERVICE_HOST_IP}
- EMBEDDING_SERVER_HOST_IP=${HOST_IP}
- EMBEDDING_SERVER_PORT=${CHATQNA_TEI_EMBEDDING_PORT:-80}
- RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
- RERANK_SERVER_HOST_IP=${HOST_IP}
- RERANK_SERVER_PORT=${CHATQNA_TEI_RERANKING_PORT:-80}
- LLM_SERVER_HOST_IP=${HOST_IP}
- LLM_SERVER_PORT=${CHATQNA_LLM_FAQGEN_PORT:-9000}
- LLM_MODEL=${CHATQNA_LLM_MODEL_ID}
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_FAQGEN}
ipc: host
restart: always
chatqna-ui-server:
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
container_name: chatqna-ui-server
depends_on:
- chatqna-backend-server
ports:
- "${CHATQNA_FRONTEND_SERVICE_PORT}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_BASE_URL=${CHATQNA_BACKEND_SERVICE_ENDPOINT}
- UPLOAD_FILE_BASE_URL=${CHATQNA_DATAPREP_SERVICE_ENDPOINT}
- GET_FILE=${CHATQNA_DATAPREP_GET_FILE_ENDPOINT}
- DELETE_FILE=${CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT}
ipc: host
restart: always
chatqna-nginx-server:
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
container_name: chatqna-nginx-server
depends_on:
- chatqna-backend-server
- chatqna-ui-server
ports:
- "${CHATQNA_NGINX_PORT}:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- FRONTEND_SERVICE_IP=${CHATQNA_FRONTEND_SERVICE_IP}
- FRONTEND_SERVICE_PORT=${CHATQNA_FRONTEND_SERVICE_PORT}
- BACKEND_SERVICE_NAME=${CHATQNA_BACKEND_SERVICE_NAME}
- BACKEND_SERVICE_IP=${CHATQNA_BACKEND_SERVICE_IP}
- BACKEND_SERVICE_PORT=${CHATQNA_BACKEND_SERVICE_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -15,6 +15,7 @@ export CHATQNA_REDIS_VECTOR_PORT=16379
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8001
export CHATQNA_REDIS_DATAPREP_PORT=6007
export CHATQNA_REDIS_RETRIEVER_PORT=7000
export CHATQNA_LLM_FAQGEN_PORT=18010
export CHATQNA_INDEX_NAME="rag-redis"
export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}

View File

@@ -2,6 +2,84 @@
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on AIPC. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
## Quick Start:
1. Set up the environment variables.
2. Run Docker Compose.
3. Consume the ChatQnA Service.
### Quick Start: 1. Set up Environment Variables
To set up environment variables for deploying ChatQnA services, follow these steps:
```bash
mkdir -p ~/OPEA
cd ~/OPEA
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/aipc
```
1. Set the required environment variables:
```bash
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
2. If you are in a proxy environment, also set the proxy-related environment variables:
```bash
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy=$no_proxy,chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
```
3. Set up other environment variables
By default, llama3.2 is used for LLM serving; the default model can be changed to another validated LLM model. Please pick a [validated LLM model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, override it by exporting OLLAMA_MODEL to the new model or by modifying set_env.sh.
For example, run the following command
```bash
export OLLAMA_MODEL="deepseek-r1:8b"
```
to use the [DeepSeek-R1-Distill-Llama-8B model](https://ollama.com/library/deepseek-r1:8b), then source the environment file:
```bash
source ./set_env.sh
```
### Quick Start: 2. Run Docker Compose
```bash
docker compose up -d
```
It will take several minutes to automatically download the Docker images.
NB: You should build the Docker images from source yourself if:
- You are developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
- You can't download the docker image.
- You want to use a specific version of Docker image.
Please refer to ['Build Docker Images'](#🚀-build-docker-images) below.
### Quick Start: 3. Consume the ChatQnA Service
Once the services are up, open the following URL from your browser: http://{host_ip}:80.
Enter a prompt such as "What is deep learning?"
Or, if you prefer to test only on the localhost machine, try:
```bash
curl http://${host_ip}:8888/v1/chatqna \
-H "Content-Type: application/json" \
-d '{
"messages": "What is deep learning?"
}'
```
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally and install the corresponding Python package.
@@ -82,18 +160,18 @@ export host_ip="External_Public_IP"
For Linux users, please run `hostname -I | awk '{print $1}'`. For Windows users, please run `ipconfig | findstr /i "IPv4"` to get the external public IP address.
**Export the value of your Huggingface API token to the `your_hf_api_token` environment variable**
**Export the value of your Huggingface API token to the `HUGGINGFACEHUB_API_TOKEN` environment variable**
> Replace the Your_Huggingface_API_Token below with your actual Huggingface API Token value
```
export your_hf_api_token="Your_Huggingface_API_Token"
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
**Append the value of the public IP address to the no_proxy list if you are in a proxy environment**
```
export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service
export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
```
- Linux PC
@@ -105,7 +183,7 @@ export https_proxy=${your_http_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export OLLAMA_HOST=${host_ip}
export OLLAMA_MODEL="llama3.2"
```
@@ -116,7 +194,7 @@ export OLLAMA_MODEL="llama3.2"
set EMBEDDING_MODEL_ID=BAAI/bge-base-en-v1.5
set RERANK_MODEL_ID=BAAI/bge-reranker-base
set INDEX_NAME=rag-redis
set HUGGINGFACEHUB_API_TOKEN=%your_hf_api_token%
set HUGGINGFACEHUB_API_TOKEN=%HUGGINGFACEHUB_API_TOKEN%
set OLLAMA_HOST=host.docker.internal
set OLLAMA_MODEL="llama3.2"
```

View File

@@ -26,7 +26,7 @@ services:
TEI_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
container_name: tei-embedding-server
ports:
- "6006:80"
@@ -59,7 +59,7 @@ services:
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
restart: unless-stopped
tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6
container_name: tei-reranking-server
ports:
- "8808:80"
@@ -109,7 +109,7 @@ services:
- RETRIEVER_SERVICE_HOST_IP=retriever
- RERANK_SERVER_HOST_IP=tei-reranking-service
- RERANK_SERVER_PORT=80
- LLM_SERVER_HOST_IP=${OLLAMA_HOST}
- LLM_SERVER_HOST_IP=ollama-service
- LLM_SERVER_PORT=11434
- LLM_MODEL=${OLLAMA_MODEL}
- LOGFLAG=${LOGFLAG}

View File

@@ -7,15 +7,17 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
if [ -z "${your_hf_api_token}" ]; then
echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set your_hf_api_token."
export host_ip=$(hostname -I | awk '{print $1}')
if [ -z "${HUGGINGFACEHUB_API_TOKEN}" ]; then
echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set HUGGINGFACEHUB_API_TOKEN."
fi
if [ -z "${host_ip}" ]; then
echo "Error: host_ip is not set. Please set host_ip first."
fi
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"

View File

@@ -1,6 +1,6 @@
# Build Mega Service of ChatQnA on Xeon
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, `llm` and `faqgen`.
The default pipeline deploys with vLLM as the LLM serving component and includes the rerank component. There are also options to drop rerank from the pipeline or to use TGI as the LLM serving backend; please refer to the [start-all-the-services-docker-containers](#start-all-the-services-docker-containers) section on this page. In addition, refer to [Build with Pinecone VectorDB](./README_pinecone.md) and [Build with Qdrant VectorDB](./README_qdrant.md) for other deployment variants.
@@ -30,20 +30,42 @@ To set up environment variables for deploying ChatQnA services, follow these ste
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,llm-faqgen
```
3. Set up other environment variables:
```bash
source ./set_env.sh
```
4. Change the model used for LLM serving
By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default can be changed to any other validated LLM model.
Please pick a model from the [validated LLM models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) table.
To override the default model defined in set_env.sh, either export LLM_MODEL_ID with the new model ID or edit set_env.sh, and then repeat step 3.
For example, to switch to Llama-2-7b-chat-hf, run the following command.
```bash
export LLM_MODEL_ID="meta-llama/Llama-2-7b-chat-hf"
```
## Quick Start: 2. Run Docker Compose
```bash
docker compose up -d
```
To enable Open Telemetry tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
CPU example with the Open Telemetry feature enabled:
> NOTE: To get the supported Grafana dashboards, please run download_opea_dashboard.sh as shown in the commands below.
```bash
./grafana/dashboards/download_opea_dashboard.sh
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
This will automatically download the Docker images from Docker Hub:
```bash
@@ -119,29 +141,27 @@ docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_p
cd ..
```
### 3. Build MegaService Docker Image
### 3. Build FaqGen LLM Image (Optional)
1. MegaService with Rerank
If you want to enable the FAQ generation LLM in the pipeline, please use the command below:
To construct the Mega Service with Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build MegaService Docker image via below command:
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
```
```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA
docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
```
### 4. Build MegaService Docker Image
2. MegaService without Rerank
To construct the Mega Service with Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build MegaService Docker image via below command:
To construct the Mega Service without Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna_without_rerank.py` Python script. Build the MegaService Docker image with the command below:
```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA
docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
```
```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA
docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank .
```
### 4. Build UI Docker Image
### 5. Build UI Docker Image
Build the frontend Docker image with the command below:
@@ -150,7 +170,7 @@ cd GenAIExamples/ChatQnA/ui
docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
```
### 5. Build Conversational React UI Docker Image (Optional)
### 6. Build Conversational React UI Docker Image (Optional)
Build the frontend Docker image that enables a conversational experience with the ChatQnA megaservice with the command below:
@@ -161,7 +181,7 @@ cd GenAIExamples/ChatQnA/ui
docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
```
### 6. Build Nginx Docker Image
### 7. Build Nginx Docker Image
```bash
cd GenAIComps
@@ -172,10 +192,14 @@ Then run the command `docker images`, you will have the following 5 Docker Image
1. `opea/dataprep:latest`
2. `opea/retriever:latest`
3. `opea/chatqna:latest` or `opea/chatqna-without-rerank:latest`
3. `opea/chatqna:latest`
4. `opea/chatqna-ui:latest`
5. `opea/nginx:latest`
If the FaqGen-related Docker image is built, you will find one more image:
- `opea/llm-faqgen:latest`
## 🚀 Start Microservices
### Required Models
@@ -199,7 +223,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_ENDPOINT="https://hf-mirror.com"
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
@@ -216,7 +240,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
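In both the online and offline cases the LLM endpoint is published on host port 8008. A simple readiness probe, assuming the defaults above (vLLM exposes an OpenAI-compatible API, TGI a `/health` route):
```bash
# vLLM: lists the served model once the weights are loaded
curl http://${host_ip}:8008/v1/models
# TGI: returns 200 once the server is ready
curl http://${host_ip}:8008/health
```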
@@ -263,12 +287,20 @@ If use vLLM as the LLM serving backend.
docker compose -f compose.yaml up -d
# Start ChatQnA without Rerank Pipeline
docker compose -f compose_without_rerank.yaml up -d
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
# Start ChatQnA with FaqGen Pipeline
docker compose -f compose_faqgen.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
# Start ChatQnA with Open Telemetry Tracing
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
# Start ChatQnA with FaqGen Pipeline
docker compose -f compose_faqgen_tgi.yaml up -d
```
### Validate Microservices
@@ -343,7 +375,16 @@ For details on how to verify the correctness of the response, refer to [how-to-v
-H 'Content-Type: application/json'
```
5. MegaService
5. FaqGen LLM Microservice (if enabled)
```bash
curl http://${host_ip}:${LLM_SERVICE_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
```
6. MegaService
```bash
curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
@@ -351,7 +392,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
}'
```
6. Nginx Service
7. Nginx Service
```bash
curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
@@ -359,7 +400,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
-d '{"messages": "What is the revenue of Nike in 2023?"}'
```
7. Dataprep Microservice (Optional)
8. Dataprep Microservice (Optional)
If you want to update the default knowledge base, you can use the following commands:
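The exact commands are in the full README behind this diff; purely as an illustration, an upload might look like the sketch below. The port, route, and file name are assumptions, not taken from this diff, so check the dataprep section of the README and compose.yaml for the actual values.
```bash
# Hypothetical example: ingest a local PDF into the knowledge base via the dataprep service
curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./your_document.pdf"
```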

View File

@@ -201,7 +201,7 @@ For users in China who are unable to download models directly from Huggingface,
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
```
2. Offline
@@ -215,7 +215,7 @@ For users in China who are unable to download models directly from Huggingface,
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
```
### Setup Environment Variables

View File

@@ -0,0 +1,80 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
  tei-embedding-service:
    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  tei-reranking-service:
    command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  # vllm-service:
  #   command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --otlp-traces-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
  chatqna-xeon-backend-server:
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  jaeger:
    image: jaegertracing/all-in-one:1.67.0
    container_name: jaeger
    ports:
      - "16686:16686"
      - "4317:4317"
      - "4318:4318"
      - "9411:9411"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      COLLECTOR_ZIPKIN_HOST_PORT: 9411
    restart: unless-stopped
  prometheus:
    image: prom/prometheus:v2.52.0
    container_name: prometheus
    user: root
    volumes:
      - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
      - ./prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yaml'
    ports:
      - '9090:9090'
    ipc: host
    restart: unless-stopped
  grafana:
    image: grafana/grafana:11.0.0
    container_name: grafana
    volumes:
      - ./grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/var/lib/grafana/dashboards
      - ./grafana/provisioning:/etc/grafana/provisioning
    user: root
    environment:
      GF_SECURITY_ADMIN_PASSWORD: admin
      GF_RENDERING_CALLBACK_URL: http://grafana:3000/
      GF_LOG_FILTERS: rendering:debug
    depends_on:
      - prometheus
    ports:
      - '3000:3000'
    ipc: host
    restart: unless-stopped
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - --collector.filesystem.ignored-mount-points
      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
    ports:
      - 9100:9100
    restart: always
    deploy:
      mode: global
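This overlay is meant to be merged with the default pipeline, as noted in the telemetry section above; the observability UIs then come up on the ports mapped here. A sketch of typical usage, with host_ip as defined earlier:
```bash
# Merge the telemetry overlay with the default pipeline
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
# Jaeger traces UI:   http://${host_ip}:16686
# Prometheus UI:      http://${host_ip}:9090
# Grafana dashboards: http://${host_ip}:3000  (admin password set via GF_SECURITY_ADMIN_PASSWORD)
```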

Some files were not shown because too many files have changed in this diff.