Compare commits
28 Commits
ft...wenjiao/up
| SHA1 |
|---|
| 99ffa4800e |
| e2bd8f50af |
| 0736912c69 |
| e8f2313e07 |
| 6d24c1c77a |
| 5a50ae0471 |
| fecc22719a |
| 2204fe8e36 |
| b50dd8f47a |
| bf8d03425c |
| 1b6342aa5b |
| 527b146a80 |
| 7159ce3731 |
| 671dff7f51 |
| 8fe19291c8 |
| 35c5cf5de8 |
| 63b789ae91 |
| d670dbf0aa |
| 0701b8cfff |
| effa2a28cf |
| adcd113f53 |
| 4269669f73 |
| 12657ac945 |
| 43d0a18270 |
| 5362321d3a |
| eb245fd085 |
| 4cab86260f |
| 694207f76b |
.github/code_spell_ignore.txt (vendored, 1 changed line)

```diff
@@ -1,2 +1,3 @@
 ModelIn
 modelin
+pressEnter
```
.github/workflows/_build_comps_base_image.yml (vendored, new file, 65 lines)

```yaml
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

name: Build Comps Base Image
permissions: read-all
on:
  workflow_call:
    inputs:
      node:
        required: true
        type: string
      build:
        default: true
        required: false
        type: boolean
      tag:
        default: "latest"
        required: false
        type: string
      opea_branch:
        default: "main"
        required: false
        type: string
      inject_commit:
        default: false
        required: false
        type: boolean

jobs:
  pre-build-image-check:
    runs-on: ubuntu-latest
    outputs:
      should_skip: ${{ steps.check-skip.outputs.should_skip }}
    steps:
      - name: Check if job should be skipped
        id: check-skip
        run: |
          should_skip=false
          if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
            should_skip=true
          fi
          echo "should_skip=$should_skip"
          echo "should_skip=$should_skip" >> $GITHUB_OUTPUT

  build-images:
    needs: [ pre-build-image-check ]
    if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' && fromJSON(inputs.build) }}
    runs-on: "docker-build-${{ inputs.node }}"
    steps:
      - name: Clean Up Working Directory
        run: sudo rm -rf ${{github.workspace}}/*

      - name: Clone Required Repo
        run: |
          git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
          cd GenAIComps && git rev-parse HEAD && cd ../ && ls -l

      - name: Build Image
        uses: opea-project/validation/actions/image-build@main
        with:
          work_dir: ${{ github.workspace }}/GenAIComps
          docker_compose_path: ${{ github.workspace }}/GenAIComps/.github/workflows/docker/compose/base-compose.yaml
          registry: ${OPEA_IMAGE_REPO}opea
          inject_commit: ${{ inputs.inject_commit }}
          tag: ${{ inputs.tag }}
```
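The `pre-build-image-check` job gates the build by writing a flag to `$GITHUB_OUTPUT`. A minimal sketch of the same pattern that can be run locally by pointing `GITHUB_OUTPUT` at a scratch file (the `gaudi3` value here is only an illustration; on a real runner the file is preset):

```bash
#!/bin/bash
# Stand-in for the runner-provided output file.
export GITHUB_OUTPUT=$(mktemp)
node="gaudi3"   # illustration; in the workflow this comes from inputs.node

should_skip=false
if [[ "$node" == "gaudi3" || "$node" == "rocm" || "$node" == "arc" ]]; then
  should_skip=true
fi
echo "should_skip=$should_skip" >> $GITHUB_OUTPUT

cat $GITHUB_OUTPUT   # prints: should_skip=true
```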
.github/workflows/_example-workflow.yml (vendored, 55 changed lines)

```diff
@@ -53,7 +53,23 @@ jobs:
 ####################################################################################################
 # Image Build
 ####################################################################################################
+  pre-build-image-check:
+    runs-on: ubuntu-latest
+    outputs:
+      should_skip: ${{ steps.check-skip.outputs.should_skip }}
+    steps:
+      - name: Check if job should be skipped
+        id: check-skip
+        run: |
+          if [[ "${{ inputs.node }}" == "gaudi3" || "${{ inputs.node }}" == "rocm" || "${{ inputs.node }}" == "arc" ]]; then
+            echo "should_skip=true" >> $GITHUB_OUTPUT
+          else
+            echo "should_skip=false" >> $GITHUB_OUTPUT
+          fi
+
   build-images:
+    needs: [pre-build-image-check]
+    if: ${{ needs.pre-build-image-check.outputs.should_skip == 'false' }}
     runs-on: "docker-build-${{ inputs.node }}"
     steps:
       - name: Clean Up Working Directory
@@ -78,16 +94,18 @@ jobs:
           cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
           docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
           if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
             git clone https://github.com/vllm-project/vllm.git && cd vllm
             # Get the latest tag
-            VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+            VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
             echo "Check out vLLM tag ${VLLM_VER}"
-            git checkout ${VLLM_VER} &> /dev/null
-            # make sure do not change the pwd
-            git rev-parse HEAD && cd ../
+            git checkout ${VLLM_VER} &> /dev/null && cd ../
           fi
           if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
-            git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
+            git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
+            # Get the latest tag
+            VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
+            echo "Check out vLLM tag ${VLLM_VER}"
+            git checkout ${VLLM_VER} &> /dev/null && cd ../
           fi
           git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
           cd GenAIComps && git rev-parse HEAD && cd ../
@@ -103,12 +121,33 @@ jobs:
           inject_commit: ${{ inputs.inject_commit }}
           tag: ${{ inputs.tag }}
 
+  pre-compose-test-check:
+    needs: [pre-build-image-check, build-images]
+    if: always()
+    runs-on: ubuntu-latest
+    outputs:
+      run_compose: ${{ steps.check-compose.outputs.run_compose }}
+    steps:
+      - name: Check if job should be skipped
+        id: check-compose
+        run: |
+          set -x
+          run_compose="false"
+          if [[ "${{ inputs.test_compose }}" == "true" ]]; then
+            if [[ "${{ needs.pre-build-image-check.outputs.should_skip }}" == "false" && "${{ needs.build-images.result }}" == "success" || "${{ needs.pre-build-image-check.outputs.should_skip }}" == "true" ]]; then
+              run_compose="true"
+            fi
+          fi
+          echo "run_compose=$run_compose"
+          echo "run_compose=$run_compose" >> $GITHUB_OUTPUT
+
 ####################################################################################################
 # Docker Compose Test
 ####################################################################################################
   test-example-compose:
-    needs: [build-images]
-    if: ${{ fromJSON(inputs.test_compose) }}
+    needs: [pre-compose-test-check]
+    if: ${{ always() && needs.pre-compose-test-check.outputs.run_compose == 'true' }}
     uses: ./.github/workflows/_run-docker-compose.yml
     with:
       tag: ${{ inputs.tag }}
```
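The `run_compose` gate relies on `&&` binding tighter than `||` inside `[[ ]]`, so compose tests run either when images were built successfully or when the build was skipped on purpose. A sketch that enumerates the cases (the variable values are illustrative stand-ins for the workflow's `needs.*` expressions):

```bash
#!/bin/bash
for should_skip in false true; do
  for build_result in success failure; do
    run_compose="false"
    # Reads as: (not skipped AND build succeeded) OR (skipped on purpose)
    if [[ "$should_skip" == "false" && "$build_result" == "success" || "$should_skip" == "true" ]]; then
      run_compose="true"
    fi
    echo "should_skip=$should_skip build_result=$build_result -> run_compose=$run_compose"
  done
done
```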
.github/workflows/_run-docker-compose.yml (vendored, 13 changed lines)

```diff
@@ -64,9 +64,14 @@ jobs:
           cd ${{ github.workspace }}/${{ inputs.example }}/tests
           run_test_cases=""
 
-          default_test_case=$(find . -type f -name "test_compose_on_${{ inputs.hardware }}.sh" | cut -d/ -f2)
+          if [ "${{ inputs.hardware }}" == "gaudi2" ] || [ "${{ inputs.hardware }}" == "gaudi3" ]; then
+            hardware="gaudi"
+          else
+            hardware="${{ inputs.hardware }}"
+          fi
+          default_test_case=$(find . -type f -name "test_compose_on_$hardware.sh" | cut -d/ -f2)
           if [ "$default_test_case" ]; then run_test_cases="$default_test_case"; fi
-          other_test_cases=$(find . -type f -name "test_compose_*_on_${{ inputs.hardware }}.sh" | cut -d/ -f2)
+          other_test_cases=$(find . -type f -name "test_compose_*_on_$hardware.sh" | cut -d/ -f2)
           echo "default_test_case=$default_test_case"
           echo "other_test_cases=$other_test_cases"
@@ -99,7 +104,7 @@ jobs:
 
   compose-test:
     needs: [get-test-case]
-    if: ${{ needs.get-test-case.outputs.test_cases != '' }}
+    if: ${{ needs.get-test-case.outputs.test_cases != '[""]' }}
     strategy:
       matrix:
         test_case: ${{ fromJSON(needs.get-test-case.outputs.test_cases) }}
@@ -160,7 +165,7 @@ jobs:
             export model_cache="~/.cache/huggingface/hub"
           fi
         fi
-        if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi
+        if [ -f "${test_case}" ]; then timeout 30m bash "${test_case}"; else echo "Test script {${test_case}} not found, skip test!"; fi
 
       - name: Clean up container after test
         shell: bash
```
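The guard change from `''` to `'[""]'` matters because an empty test-case list does not serialize to an empty string. Assuming `get-test-case` builds its JSON with the same `jq -R`/`jq -sc` pipeline used for the node matrix elsewhere in these workflows, an empty input still yields a one-element array:

```bash
# An empty candidate list still produces a non-empty JSON array.
test_cases=""
echo "$test_cases" | jq -R '.' | jq -sc '.'   # prints: [""]
```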
.github/workflows/daily_check_issue_and_pr.yml (vendored, new file, 28 lines)

```yaml
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

name: Check stale issue and pr

on:
  schedule:
    - cron: "30 22 * * *"

jobs:
  close-issues:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v9
        with:
          days-before-issue-stale: 30
          days-before-pr-stale: 30
          days-before-issue-close: 7
          days-before-pr-close: 7
          stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
          stale-pr-message: "This PR is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
          close-issue-message: "This issue was closed because it has been stalled for 7 days with no activity."
          close-pr-message: "This PR was closed because it has been stalled for 7 days with no activity."
          repo-token: ${{ secrets.ACTION_TOKEN }}
          start-date: "2025-03-01T00:00:00Z"
```
.github/workflows/manual-docker-scan.yml (vendored, 2 changed lines)

```diff
@@ -12,7 +12,7 @@ on:
       type: string
     examples:
       default: ""
-      description: 'List of examples to publish "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,FaqGen,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA"'
+      description: 'List of examples to publish "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA"'
       required: false
       type: string
     images:
```
.github/workflows/manual-example-workflow.yml (vendored, 29 changed lines)

```diff
@@ -7,7 +7,7 @@ on:
   inputs:
     nodes:
       default: "gaudi,xeon"
-      description: "Hardware to run test"
+      description: "Hardware to run test gaudi,gaudi3,xeon,rocm,arc"
       required: true
       type: string
     examples:
@@ -20,11 +20,6 @@ on:
       description: "Tag to apply to images"
       required: true
       type: string
-    # deploy_gmc:
-    #   default: false
-    #   description: 'Whether to deploy gmc'
-    #   required: true
-    #   type: boolean
     build:
       default: true
       description: 'Build test required images for Examples'
@@ -40,11 +35,6 @@ on:
       description: 'Test examples with helm charts'
       required: false
      type: boolean
-    # test_gmc:
-    #   default: false
-    #   description: 'Test examples with gmc'
-    #   required: false
-    #   type: boolean
     opea_branch:
       default: "main"
       description: 'OPEA branch for image build'
@@ -52,12 +42,12 @@ on:
       type: string
     inject_commit:
       default: false
-      description: "inject commit to docker images true or false"
+      description: "inject commit to docker images"
       required: false
       type: boolean
     use_model_cache:
       default: false
-      description: "use model cache true or false"
+      description: "use model cache"
       required: false
       type: boolean
 
@@ -79,24 +69,20 @@ jobs:
       nodes_json=$(printf '%s\n' "${nodes[@]}" | sort -u | jq -R '.' | jq -sc '.')
       echo "nodes=$nodes_json" >> $GITHUB_OUTPUT
 
-  build-deploy-gmc:
+  build-comps-base:
     needs: [get-test-matrix]
-    if: false
-    #${{ fromJSON(inputs.deploy_gmc) }}
     strategy:
       matrix:
        node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
      fail-fast: false
-    uses: ./.github/workflows/_gmc-workflow.yml
+    uses: ./.github/workflows/_build_comps_base_image.yml
     with:
       node: ${{ matrix.node }}
+      build: ${{ fromJSON(inputs.build) }}
       tag: ${{ inputs.tag }}
       opea_branch: ${{ inputs.opea_branch }}
     secrets: inherit
 
   run-examples:
-    needs: [get-test-matrix] #[get-test-matrix, build-deploy-gmc]
-    if: always()
+    needs: [get-test-matrix, build-comps-base]
     strategy:
       matrix:
         example: ${{ fromJson(needs.get-test-matrix.outputs.examples) }}
@@ -110,7 +96,6 @@ jobs:
       build: ${{ fromJSON(inputs.build) }}
       test_compose: ${{ fromJSON(inputs.test_compose) }}
       test_helmchart: ${{ fromJSON(inputs.test_helmchart) }}
-      # test_gmc: ${{ fromJSON(inputs.test_gmc) }}
       opea_branch: ${{ inputs.opea_branch }}
       inject_commit: ${{ inputs.inject_commit }}
       use_model_cache: ${{ inputs.use_model_cache }}
```
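The `get-test-matrix` job shown above turns the comma-separated `nodes` input into a deduplicated JSON array for the job matrix. A small sketch of that transformation (the input string and the `IFS` split are illustrative; only the `sort`/`jq` pipeline is taken from the workflow):

```bash
#!/bin/bash
input="gaudi,xeon,gaudi"             # illustration of inputs.nodes
IFS=',' read -ra nodes <<< "$input"  # split on commas into a bash array
nodes_json=$(printf '%s\n' "${nodes[@]}" | sort -u | jq -R '.' | jq -sc '.')
echo "$nodes_json"                   # prints: ["gaudi","xeon"]
```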
.github/workflows/manual-image-build.yml (vendored, 4 changed lines)

```diff
@@ -32,9 +32,9 @@ on:
       type: string
     inject_commit:
       default: false
-      description: "inject commit to docker images true or false"
+      description: "inject commit to docker images"
       required: false
-      type: string
+      type: boolean
 
 jobs:
   get-test-matrix:
```

```diff
@@ -32,6 +32,12 @@ jobs:
       echo "TAG=$TAG" >> $GITHUB_OUTPUT
       echo "PUBLISH_TAGS=$PUBLISH_TAGS" >> $GITHUB_OUTPUT
 
+  build-comps-base:
+    needs: [get-build-matrix]
+    uses: ./.github/workflows/_build_comps_base_image.yml
+    with:
+      node: gaudi
+
   build-and-test:
     needs: get-build-matrix
     if: ${{ needs.get-build-matrix.outputs.examples_json != '' }}
@@ -44,6 +50,7 @@ jobs:
       node: gaudi
       example: ${{ matrix.example }}
       test_compose: true
+      inject_commit: true
     secrets: inherit
 
   get-image-list:
```
.github/workflows/weekly-update-images.yml (vendored, 4 changed lines)

```diff
@@ -1,11 +1,9 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-name: Weekly update base images and 3rd party images
+name: Weekly update 3rd party images
 
 on:
   schedule:
     - cron: "0 0 * * 0"
   workflow_dispatch:
 
 permissions:
```
````diff
@@ -2,7 +2,7 @@
 
 ## Overview
 
-This example showcases a hierarchical multi-agent system for question-answering applications. The architecture diagram is shown below. The supervisor agent interfaces with the user and dispatch tasks to two worker agents to gather information and come up with answers. The worker RAG agent uses the retrieval tool to retrieve relevant documents from the knowledge base (a vector database). The worker SQL agent retrieve relevant data from the SQL database. Although not included in this example, but other tools such as a web search tool or a knowledge graph query tool can be used by the supervisor agent to gather information from additional sources.
+This example showcases a hierarchical multi-agent system for question-answering applications. The architecture diagram below shows a supervisor agent that interfaces with the user and dispatches tasks to two worker agents to gather information and come up with answers. The worker RAG agent uses the retrieval tool to retrieve relevant documents from a knowledge base - a vector database. The worker SQL agent retrieves relevant data from a SQL database. Although not included in this example by default, other tools such as a web search tool or a knowledge graph query tool can be used by the supervisor agent to gather information from additional sources.
 
 
 
 The AgentQnA example is implemented using the component-level microservices defined in [GenAIComps](https://github.com/opea-project/GenAIComps). The flow chart below shows the information flow between different microservices for this example.
@@ -75,190 +75,161 @@ flowchart LR
 
 ```
 
-### Why Agent for question answering?
+### Why should AI Agents be used for question-answering?
 
-1. Improve relevancy of retrieved context.
-   RAG agent can rephrase user queries, decompose user queries, and iterate to get the most relevant context for answering user's questions. Compared to conventional RAG, RAG agent can significantly improve the correctness and relevancy of the answer.
-2. Expand scope of the agent.
-   The supervisor agent can interact with multiple worker agents that specialize in different domains with different skills (e.g., retrieve documents, write SQL queries, etc.), and thus can answer questions in multiple domains.
-3. Hierarchical multi-agents can improve performance.
-   Expert worker agents, such as RAG agent and SQL agent, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information together to provide a comprehensive answer. If we only use one agent and provide all the tools to this single agent, it may get overwhelmed and not able to provide accurate answers.
+1. **Improve relevancy of retrieved context.**
+   RAG agents can rephrase user queries, decompose user queries, and iterate to get the most relevant context for answering a user's question. Compared to conventional RAG, RAG agents significantly improve the correctness and relevancy of the answer because of the iterations it goes through.
+2. **Expand scope of skills.**
+   The supervisor agent interacts with multiple worker agents that specialize in different skills (e.g., retrieve documents, write SQL queries, etc.). Thus, it can answer questions with different methods.
+3. **Hierarchical multi-agents improve performance.**
+   Expert worker agents, such as RAG agents and SQL agents, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information to provide a comprehensive answer. If only one agent is used and all tools are provided to this single agent, it can lead to large overhead or not use the best tool to provide accurate answers.
 
 ## Deploy with docker
 
-1. Build agent docker image [Optional]
-
-> [!NOTE]
-> the step is optional. The docker images will be automatically pulled when running the docker compose commands. This step is only needed if pulling images failed.
-
-First, clone the opea GenAIComps repo.
+### 1. Set up environment </br>
+
+#### First, clone the `GenAIExamples` repo.
 
 ```
 export WORKDIR=<your-work-directory>
 cd $WORKDIR
-git clone https://github.com/opea-project/GenAIComps.git
+git clone https://github.com/opea-project/GenAIExamples.git
 ```
 
-Then build the agent docker image. Both the supervisor agent and the worker agent will use the same docker image, but when we launch the two agents we will specify different strategies and register different tools.
+#### Second, set up environment variables.
+
+##### For proxy environments only
 
 ```
-cd GenAIComps
-docker build -t opea/agent:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/agent/src/Dockerfile .
+export http_proxy="Your_HTTP_Proxy"
+export https_proxy="Your_HTTPs_Proxy"
+# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+export no_proxy="Your_No_Proxy"
 ```
 
-2. Set up environment for this example </br>
+##### For using open-source llms
 
-First, clone this repo.
+```
+export HUGGINGFACEHUB_API_TOKEN=<your-HF-token>
+export HF_CACHE_DIR=<directory-where-llms-are-downloaded> #so that no need to redownload every time
+```
 
-```
-export WORKDIR=<your-work-directory>
-cd $WORKDIR
-git clone https://github.com/opea-project/GenAIExamples.git
-```
+##### [Optional] OPANAI_API_KEY to use OpenAI models
 
-Second, set up env vars.
+```
+export OPENAI_API_KEY=<your-openai-key>
+```
 
-```
-# Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
-export host_ip=$(hostname -I | awk '{print $1}')
-# if you are in a proxy environment, also set the proxy-related environment variables
-export http_proxy="Your_HTTP_Proxy"
-export https_proxy="Your_HTTPs_Proxy"
-# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
-export no_proxy="Your_No_Proxy"
-
-export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
-# for using open-source llms
-export HUGGINGFACEHUB_API_TOKEN=<your-HF-token>
-export HF_CACHE_DIR=<directory-where-llms-are-downloaded> #so that no need to redownload every time
-
-# optional: OPANAI_API_KEY if you want to use OpenAI models
-export OPENAI_API_KEY=<your-openai-key>
-```
+#### Third, set up environment variables for the selected hardware using the corresponding `set_env.sh`
 
-3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
+##### Gaudi
 
-First, launch the mega-service.
+```
+source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+```
 
-```
-cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
-bash launch_retrieval_tool.sh
-```
+##### Xeon
 
-Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
+```
+source $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon/set_env.sh
+```
 
-```
-bash run_ingest_data.sh
-```
+### 3. Launch the multi-agent system. </br>
 
-4. Prepare SQL database
-   In this example, we will use the Chinook SQLite database. Run the commands below.
+Two options are provided for the `llm_engine` of the agents: 1. open-source LLMs on Gaudi, 2. OpenAI models via API calls.
 
-```
-# Download data
-cd $WORKDIR
-git clone https://github.com/lerocha/chinook-database.git
-cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
-```
+#### Gaudi
 
-5. Launch other tools. </br>
-   In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
+On Gaudi, `meta-llama/Meta-Llama-3.1-70B-Instruct` will be served using vllm.
+By default, both the RAG agent and SQL agent will be launched to support the React Agent.
+The React Agent requires the DocIndexRetriever's [`compose.yaml`](../DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml) file, so two `compose.yaml` files need to be run with docker compose to start the multi-agent system.
 
-```
-docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
-```
+> **Note**: To enable the web search tool, skip this step and proceed to the "[Optional] Web Search Tool Support" section.
 
-6. Launch multi-agent system. </br>
-   We provide two options for `llm_engine` of the agents: 1. open-source LLMs on Intel Gaudi2, 2. OpenAI models via API calls.
+```bash
+cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
+docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
+```
 
-::::{tab-set}
-:::{tab-item} Gaudi
-:sync: Gaudi
+##### [Optional] Web Search Tool Support
 
-On Gaudi2 we will serve `meta-llama/Meta-Llama-3.1-70B-Instruct` using vllm.
+<details>
+<summary> Instructions </summary>
+A web search tool is supported in this example and can be enabled by running docker compose with the `compose.webtool.yaml` file.
+The Google Search API is used. Follow the [instructions](https://python.langchain.com/docs/integrations/tools/google_search) to create an API key and enable the Custom Search API on a Google account. The environment variables `GOOGLE_CSE_ID` and `GOOGLE_API_KEY` need to be set.
 
-First build vllm-gaudi docker image.
+```bash
+cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
+export GOOGLE_CSE_ID="YOUR_ID"
+export GOOGLE_API_KEY="YOUR_API_KEY"
+docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f compose.webtool.yaml up -d
+```
 
-```bash
-cd $WORKDIR
-git clone https://github.com/vllm-project/vllm.git
-cd ./vllm
-git checkout v0.6.6
-docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
-```
+</details>
 
-Then launch vllm on Gaudi2 with the command below.
+#### Xeon
 
-```bash
-vllm_port=8086
-model="meta-llama/Meta-Llama-3.1-70B-Instruct"
-docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
-```
+On Xeon, only OpenAI models are supported.
+By default, both the RAG Agent and SQL Agent will be launched to support the React Agent.
+The React Agent requires the DocIndexRetriever's [`compose.yaml`](../DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml) file, so two `compose yaml` files need to be run with docker compose to start the multi-agent system.
 
-Then launch Agent microservices.
+```bash
+export OPENAI_API_KEY=<your-openai-key>
+cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
+docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose_openai.yaml up -d
+```
 
-```bash
-cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
-bash launch_agent_service_gaudi.sh
-```
+### 4. Ingest Data into the vector database
 
-:::
-:::{tab-item} Xeon
-:sync: Xeon
+The `run_ingest_data.sh` script will use an example jsonl file to ingest example documents into a vector database. Other ways to ingest data and other types of documents supported can be found in the OPEA dataprep microservice located in the opea-project/GenAIComps repo.
 
-To use OpenAI models, run commands below.
+```bash
+cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/
+bash run_ingest_data.sh
+```
 
-```
-export OPENAI_API_KEY=<your-openai-key>
-cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
-bash launch_agent_service_openai.sh
-```
+> **Note**: This is a one-time operation.
 
-:::
-::::
+## Launch the UI
 
-## Deploy AgentQnA UI
+Open a web browser to http://localhost:5173 to access the UI. Ensure the environment variable `AGENT_URL` is set to http://$ip_address:9090/v1/chat/completions in [ui/svelte/.env](./ui/svelte/.env) or else the UI may not work properly.
 
-The AgentQnA UI can be deployed locally or using Docker.
+The AgentQnA UI can be deployed locally or using Docker. To customize deployment, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
 
-For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
-
-## Deploy using Helm Chart
+## [Optional] Deploy using Helm Charts
 
 Refer to the [AgentQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AgentQnA on Kubernetes.
 
-## Validate services
+## Validate Services
 
-1. First look at logs of the agent docker containers:
+1. First look at logs for each of the agent docker containers:
 
-```
+```bash
 # worker RAG agent
 docker logs rag-agent-endpoint
 
 # worker SQL agent
 docker logs sql-agent-endpoint
 ```
 
 ```
 # supervisor agent
 docker logs react-agent-endpoint
 ```
 
-You should see something like "HTTP server setup successful" if the docker containers are started successfully.</p>
+Look for the message "HTTP server setup successful" to confirm the agent docker container has started successfully.</p>
 
-2. You can use python to validate the agent system
+2. Use python to validate each agent is working properly:
 
 ```bash
 # RAG worker agent
-python tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
+python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
 
 # SQL agent
-python tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
+python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
 
 # supervisor agent: this will test a two-turn conversation
-python tests/test.py --agent_role "supervisor" --ext_port 9090
+python $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port 9090
 ```
 
-## How to register your own tools with agent
+## How to register other tools with the AI agent
 
-You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
+The [tools](./tools) folder contains YAML and Python files for additional tools for the supervisor and worker agents. Refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md) to add tools and customize the AI agents.
````
````diff
@@ -1,123 +1,3 @@
 # Single node on-prem deployment with Docker Compose on Xeon Scalable processors
 
-This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on Xeon. For LLMs, we use OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md).
-
-## Deployment with docker
-
-1. First, clone this repo.
-   ```
-   export WORKDIR=<your-work-directory>
-   cd $WORKDIR
-   git clone https://github.com/opea-project/GenAIExamples.git
-   ```
-2. Set up environment for this example </br>
-
-   ```
-   # Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
-   export host_ip=$(hostname -I | awk '{print $1}')
-   # if you are in a proxy environment, also set the proxy-related environment variables
-   export http_proxy="Your_HTTP_Proxy"
-   export https_proxy="Your_HTTPs_Proxy"
-   # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
-   export no_proxy="Your_No_Proxy"
-
-   export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
-   #OPANAI_API_KEY if you want to use OpenAI models
-   export OPENAI_API_KEY=<your-openai-key>
-   ```
-
-3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
-
-   First, launch the mega-service.
-
-   ```
-   cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
-   bash launch_retrieval_tool.sh
-   ```
-
-   Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
-
-   ```
-   bash run_ingest_data.sh
-   ```
-
-4. Prepare SQL database
-   In this example, we will use the SQLite database provided in the [TAG-Bench](https://github.com/TAG-Research/TAG-Bench/tree/main). Run the commands below.
-
-   ```
-   # Download data
-   cd $WORKDIR
-   git clone https://github.com/TAG-Research/TAG-Bench.git
-   cd TAG-Bench/setup
-   chmod +x get_dbs.sh
-   ./get_dbs.sh
-   ```
-
-5. Launch Tool service
-   In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
-   ```
-   docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
-   ```
-6. Launch multi-agent system
-
-   The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use OpenAI GPT-4o-mini as LLM.
-
-   ```
-   cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
-   bash launch_agent_service_openai.sh
-   ```
-
-7. [Optional] Build `Agent` docker image if pulling images failed.
-
-   ```
-   git clone https://github.com/opea-project/GenAIComps.git
-   cd GenAIComps
-   docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
-   ```
-
-## Validate services
-
-First look at logs of the agent docker containers:
-
-```
-# worker RAG agent
-docker logs rag-agent-endpoint
-
-# worker SQL agent
-docker logs sql-agent-endpoint
-```
-
-```
-# supervisor agent
-docker logs react-agent-endpoint
-```
-
-You should see something like "HTTP server setup successful" if the docker containers are started successfully.</p>
-
-Second, validate worker RAG agent:
-
-```
-curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
-    "messages": "Michael Jackson song Thriller"
-    }'
-```
-
-Third, validate worker SQL agent:
-
-```
-curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
-    "messages": "How many employees are in the company?"
-    }'
-```
-
-Finally, validate supervisor agent:
-
-```
-curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
-    "messages": "How many albums does Iron Maiden have?"
-    }'
-```
-
-## How to register your own tools with agent
-
-You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
+This example showcases a hierarchical multi-agent system for question-answering applications. To deploy the example on Xeon, OpenAI LLM models via API calls are used. For instructions, refer to the deployment guide [here](../../../../README.md).
````
```diff
@@ -92,4 +92,23 @@ services:
       LANGCHAIN_PROJECT: "opea-supervisor-agent-service"
       CRAG_SERVER: $CRAG_SERVER
       WORKER_AGENT_URL: $WORKER_AGENT_URL
       SQL_AGENT_URL: $SQL_AGENT_URL
       port: 9090
+  mock-api:
+    image: docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
+    container_name: mock-api
+    ports:
+      - "8080:8000"
+    ipc: host
+  agent-ui:
+    image: opea:agent-ui
+    container_name: agent-ui
+    volumes:
+      - ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env # test db
+    ports:
+      - "5173:5173"
+    ipc: host
+
+networks:
+  default:
+    driver: bridge
```
@@ -1,22 +0,0 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
pushd "../../../../../" > /dev/null
|
||||
source .set_env.sh
|
||||
popd > /dev/null
|
||||
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
|
||||
export ip_address=$(hostname -I | awk '{print $1}')
|
||||
export recursion_limit_worker=12
|
||||
export recursion_limit_supervisor=10
|
||||
export model="gpt-4o-mini-2024-07-18"
|
||||
export temperature=0
|
||||
export max_new_tokens=4096
|
||||
export OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
|
||||
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
|
||||
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
|
||||
export CRAG_SERVER=http://${ip_address}:8080
|
||||
export db_name=Chinook
|
||||
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
|
||||
|
||||
docker compose -f compose_openai.yaml up -d
|
||||
AgentQnA/docker_compose/intel/cpu/xeon/set_env.sh (new file, 57 lines)

```bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null

if [[ -z "${WORKDIR}" ]]; then
    echo "Please set WORKDIR environment variable"
    exit 0
fi
echo "WORKDIR=${WORKDIR}"
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export ip_address=$(hostname -I | awk '{print $1}')
export recursion_limit_worker=12
export recursion_limit_supervisor=10
export model="gpt-4o-mini-2024-07-18"
export temperature=0
export max_new_tokens=4096
export OPENAI_API_KEY=${OPENAI_API_KEY}
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
export db_name=Chinook
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"

if [ ! -f $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite ]; then
    echo "Download Chinook_Sqlite!"
    wget -O $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite https://github.com/lerocha/chinook-database/releases/download/v1.4.5/Chinook_Sqlite.sqlite
fi

# retriever
export host_ip=$(hostname -I | awk '{print $1}')
export HF_CACHE_DIR=${HF_CACHE_DIR}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export no_proxy=${no_proxy}
export http_proxy=${http_proxy}
export https_proxy=${https_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
export REDIS_URL="redis://${host_ip}:6379"
export INDEX_NAME="rag-redis"
export RERANK_TYPE="tei"
export MEGA_SERVICE_HOST_IP=${host_ip}
export EMBEDDING_SERVICE_HOST_IP=${host_ip}
export RETRIEVER_SERVICE_HOST_IP=${host_ip}
export RERANK_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"

export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui"
```
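This script refuses to run unless `WORKDIR` is set, and its `pushd ../../../../../` assumes it is sourced from its own directory. A minimal usage sketch (the `$HOME/work` path is only an illustration):

```bash
export WORKDIR=$HOME/work   # any directory that contains the GenAIExamples clone
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
source set_env.sh           # exports the agent, retriever, and proxy variables above
```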
````diff
@@ -1,147 +1,3 @@
 # Single node on-prem deployment AgentQnA on Gaudi
 
-This example showcases a hierarchical multi-agent system for question-answering applications. We deploy the example on Gaudi using open-source LLMs.
-For more details, please refer to the deployment guide [here](../../../../README.md).
-
-## Deployment with docker
-
-1. First, clone this repo.
-   ```
-   export WORKDIR=<your-work-directory>
-   cd $WORKDIR
-   git clone https://github.com/opea-project/GenAIExamples.git
-   ```
-2. Set up environment for this example </br>
-
-   ```
-   # Example: host_ip="192.168.1.1" or export host_ip="External_Public_IP"
-   export host_ip=$(hostname -I | awk '{print $1}')
-   # if you are in a proxy environment, also set the proxy-related environment variables
-   export http_proxy="Your_HTTP_Proxy"
-   export https_proxy="Your_HTTPs_Proxy"
-   # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
-   export no_proxy="Your_No_Proxy"
-
-   export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
-   # for using open-source llms
-   export HUGGINGFACEHUB_API_TOKEN=<your-HF-token>
-   # Example export HF_CACHE_DIR=$WORKDIR so that no need to redownload every time
-   export HF_CACHE_DIR=<directory-where-llms-are-downloaded>
-
-   ```
-
-3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service)
-
-   First, launch the mega-service.
-
-   ```
-   cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool
-   bash launch_retrieval_tool.sh
-   ```
-
-   Then, ingest data into the vector database. Here we provide an example. You can ingest your own data.
-
-   ```
-   bash run_ingest_data.sh
-   ```
-
-4. Prepare SQL database
-   In this example, we will use the Chinook SQLite database. Run the commands below.
-
-   ```
-   # Download data
-   cd $WORKDIR
-   git clone https://github.com/lerocha/chinook-database.git
-   cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
-   ```
-
-5. Launch Tool service
-   In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs.
-   ```
-   docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
-   ```
-6. Launch multi-agent system
-
-   On Gaudi2 we will serve `meta-llama/Meta-Llama-3.1-70B-Instruct` using vllm.
-
-   First build vllm-gaudi docker image.
-
-   ```bash
-   cd $WORKDIR
-   git clone https://github.com/vllm-project/vllm.git
-   cd ./vllm
-   git checkout v0.6.6
-   docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
-   ```
-
-   Then launch vllm on Gaudi2 with the command below.
-
-   ```bash
-   vllm_port=8086
-   model="meta-llama/Meta-Llama-3.1-70B-Instruct"
-   docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
-   ```
-
-   Then launch Agent microservices.
-
-   ```bash
-   cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi/
-   bash launch_agent_service_gaudi.sh
-   ```
-
-7. [Optional] Build `Agent` docker image if pulling images failed.
-
-   If docker image pulling failed in Step 6 above, build the agent docker image with the commands below. After image build, try Step 6 again.
-
-   ```
-   git clone https://github.com/opea-project/GenAIComps.git
-   cd GenAIComps
-   docker build -t opea/agent:latest -f comps/agent/src/Dockerfile .
-   ```
-
-## Validate services
-
-First look at logs of the agent docker containers:
-
-```
-# worker RAG agent
-docker logs rag-agent-endpoint
-
-# worker SQL agent
-docker logs sql-agent-endpoint
-```
-
-```
-# supervisor agent
-docker logs react-agent-endpoint
-```
-
-You should see something like "HTTP server setup successful" if the docker containers are started successfully.</p>
-
-Second, validate worker RAG agent:
-
-```
-curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
-    "messages": "Michael Jackson song Thriller"
-    }'
-```
-
-Third, validate worker SQL agent:
-
-```
-curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
-    "messages": "How many employees are in the company?"
-    }'
-```
-
-Finally, validate supervisor agent:
-
-```
-curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
-    "messages": "How many albums does Iron Maiden have?"
-    }'
-```
-
-## How to register your own tools with agent
-
-You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/src/README.md).
+This example showcases a hierarchical multi-agent system for question-answering applications. To deploy the example on Gaudi using open-source LLMs, refer to the deployment guide [here](../../../../README.md).
````
(new file, 9 lines)

```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  supervisor-react-agent:
    environment:
      - tools=/home/user/tools/supervisor_agent_webtools.yaml
      - GOOGLE_CSE_ID=${GOOGLE_CSE_ID}
      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
```
```diff
@@ -97,3 +97,47 @@ services:
       WORKER_AGENT_URL: $WORKER_AGENT_URL
       SQL_AGENT_URL: $SQL_AGENT_URL
       port: 9090
+  mock-api:
+    image: docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0
+    container_name: mock-api
+    ports:
+      - "8080:8000"
+    ipc: host
+  agent-ui:
+    image: opea/agent-ui
+    container_name: agent-ui
+    volumes:
+      - ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env:/home/user/svelte/.env
+    environment:
+      host_ip: ${host_ip}
+    ports:
+      - "5173:5173"
+    ipc: host
+  vllm-service:
+    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
+    container_name: vllm-gaudi-server
+    ports:
+      - "8086:8000"
+    volumes:
+      - "./data:/data"
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      HABANA_VISIBLE_DEVICES: all
+      OMPI_MCA_btl_vader_single_copy_mechanism: none
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
+      VLLM_SKIP_WARMUP: true
+      PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://$host_ip:8086/health || exit 1"]
+      interval: 10s
+      timeout: 10s
+      retries: 100
+    runtime: habana
+    cap_add:
+      - SYS_NICE
+    ipc: host
+    command: --model $LLM_MODEL_ID --tensor-parallel-size 4 --host 0.0.0.0 --port 8000 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 16384
```
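The `vllm-service` healthcheck added above polls the server's `/health` endpoint for up to 100 retries. The same readiness probe can be issued by hand once the stack is up (assuming `host_ip` is exported as in `set_env.sh`):

```bash
# Succeeds (exit 0) only after vLLM has finished loading the model.
curl -f http://${host_ip}:8086/health && echo "vllm-gaudi-server is ready"
```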
@@ -1,36 +0,0 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
pushd "../../../../../" > /dev/null
|
||||
source .set_env.sh
|
||||
popd > /dev/null
|
||||
WORKPATH=$(dirname "$PWD")/..
|
||||
# export WORKDIR=$WORKPATH/../../
|
||||
echo "WORKDIR=${WORKDIR}"
|
||||
export ip_address=$(hostname -I | awk '{print $1}')
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
|
||||
# LLM related environment variables
|
||||
export HF_CACHE_DIR=${HF_CACHE_DIR}
|
||||
ls $HF_CACHE_DIR
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
export NUM_SHARDS=4
|
||||
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
|
||||
export temperature=0
|
||||
export max_new_tokens=4096
|
||||
|
||||
# agent related environment variables
|
||||
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
|
||||
echo "TOOLSET_PATH=${TOOLSET_PATH}"
|
||||
export recursion_limit_worker=12
|
||||
export recursion_limit_supervisor=10
|
||||
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
|
||||
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
|
||||
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
|
||||
export CRAG_SERVER=http://${ip_address}:8080
|
||||
|
||||
export db_name=Chinook
|
||||
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
|
||||
|
||||
docker compose -f compose.yaml up -d
|
||||
@@ -1,25 +0,0 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# LLM related environment variables
|
||||
export HF_CACHE_DIR=${HF_CACHE_DIR}
|
||||
ls $HF_CACHE_DIR
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
|
||||
export NUM_SHARDS=4
|
||||
|
||||
docker compose -f tgi_gaudi.yaml up -d
|
||||
|
||||
sleep 5s
|
||||
echo "Waiting tgi gaudi ready"
|
||||
n=0
|
||||
until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
|
||||
docker logs tgi-server &> tgi-gaudi-service.log
|
||||
n=$((n+1))
|
||||
if grep -q Connected tgi-gaudi-service.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
done
|
||||
sleep 5s
|
||||
echo "Service started successfully"
|
||||
AgentQnA/docker_compose/intel/hpu/gaudi/set_env.sh (new file, 69 lines)

```bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
WORKPATH=$(dirname "$PWD")/..
# export WORKDIR=$WORKPATH/../../
if [[ -z "${WORKDIR}" ]]; then
    echo "Please set WORKDIR environment variable"
    exit 0
fi
echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')

# LLM related environment variables
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct"
export NUM_SHARDS=4
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
export temperature=0
export max_new_tokens=4096

# agent related environment variables
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
echo "TOOLSET_PATH=${TOOLSET_PATH}"
export recursion_limit_worker=12
export recursion_limit_supervisor=10
export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080

export db_name=Chinook
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
if [ ! -f $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite ]; then
    echo "Download Chinook_Sqlite!"
    wget -O $WORKDIR/GenAIExamples/AgentQnA/tests/Chinook_Sqlite.sqlite https://github.com/lerocha/chinook-database/releases/download/v1.4.5/Chinook_Sqlite.sqlite
fi

# configure agent ui
echo "AGENT_URL = 'http://$ip_address:9090/v1/chat/completions'" | tee ${WORKDIR}/GenAIExamples/AgentQnA/ui/svelte/.env

# retriever
export host_ip=$(hostname -I | awk '{print $1}')
export no_proxy=${no_proxy}
export http_proxy=${http_proxy}
export https_proxy=${https_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
export REDIS_URL="redis://${host_ip}:6379"
export INDEX_NAME="rag-redis"
export RERANK_TYPE="tei"
export MEGA_SERVICE_HOST_IP=${host_ip}
export EMBEDDING_SERVICE_HOST_IP=${host_ip}
export RETRIEVER_SERVICE_HOST_IP=${host_ip}
export RERANK_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8889/v1/retrievaltool"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/ingest"
export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get"
export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete"

export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$host_ip"
```
```diff
@@ -20,23 +20,30 @@ function stop_agent_and_api_server() {
 
 function stop_retrieval_tool() {
     echo "Stopping Retrieval tool"
-    docker compose -f $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/docker/docker-compose-retrieval-tool.yaml down
+    local RETRIEVAL_TOOL_PATH=$WORKPATH/../DocIndexRetriever
+    cd $RETRIEVAL_TOOL_PATH/docker_compose/intel/cpu/xeon/
+    container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2)
+    for container_name in $container_list; do
+        cid=$(docker ps -aq --filter "name=$container_name")
+        echo "Stopping container $container_name"
+        if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
+    done
 }
 
 echo "=================== #1 Building docker images===================="
-bash 1_build_images.sh
+bash step1_build_images.sh
 echo "=================== #1 Building docker images completed===================="
 
 echo "=================== #2 Start retrieval tool===================="
-bash 2_start_retrieval_tool.sh
+bash step2_start_retrieval_tool.sh
 echo "=================== #2 Retrieval tool started===================="
 
 echo "=================== #3 Ingest data and validate retrieval===================="
-bash 3_ingest_data_and_validate_retrieval.sh
+bash step3_ingest_data_and_validate_retrieval.sh
 echo "=================== #3 Data ingestion and validation completed===================="
 
 echo "=================== #4 Start agent and API server===================="
-bash 4_launch_and_validate_agent_openai.sh
+bash step4_launch_and_validate_agent_openai.sh
 echo "=================== #4 Agent test passed ===================="
 
 echo "=================== #5 Stop agent and API server===================="
```
@@ -42,7 +42,8 @@ function build_vllm_docker_image() {
        git clone https://github.com/HabanaAI/vllm-fork.git
    fi
    cd ./vllm-fork
    git checkout v0.6.4.post2+Gaudi-1.19.0
    VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
    git checkout ${VLLM_VER} &> /dev/null
    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
    if [ $? -ne 0 ]; then
        echo "opea/vllm-gaudi:ci failed"

@@ -27,18 +27,20 @@ vllm_volume=${HF_CACHE_DIR}
function start_tgi(){
    echo "Starting tgi-gaudi server"
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
    bash launch_tgi_gaudi.sh
    source set_env.sh
    docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml -f tgi_gaudi.yaml up -d

}

function start_vllm_service_70B() {
function start_all_services() {

    echo "token is ${HF_TOKEN}"

    echo "start vllm gaudi service"
    echo "**************model is $model**************"
    vllm_image=opea/vllm-gaudi:ci
    docker run -d --runtime=habana --rm --name "vllm-gaudi-server" -e HABANA_VISIBLE_DEVICES=0,1,2,3 -p $vllm_port:8000 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host $vllm_image --model ${model} --max-seq-len-to-capture 16384 --tensor-parallel-size 4
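    # Editor's note: the container above exposes 4 HPUs (HABANA_VISIBLE_DEVICES=0,1,2,3),
    # matching --tensor-parallel-size 4 in the launch command.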
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
    source set_env.sh
    docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml up -d
    sleep 5s
    echo "Waiting vllm gaudi ready"
    n=0
@@ -67,15 +69,6 @@ function download_chinook_data(){
    cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
}

function start_agent_and_api_server() {
    echo "Starting CRAG server"
    docker run -d --runtime=runc --name=kdd-cup-24-crag-service -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0

    echo "Starting Agent services"
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi
    bash launch_agent_service_gaudi.sh
    sleep 2m
}

function validate() {
    local CONTENT="$1"
@@ -95,8 +88,9 @@ function validate_agent_service() {
    # # test worker rag agent
    echo "======================Testing worker rag agent======================"
    export agent_port="9095"
    export agent_ip="127.0.0.1"
    prompt="Tell me about Michael Jackson song Thriller"
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ip_addr $agent_ip --ext_port $agent_port)
    # echo $CONTENT
    local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
    echo $EXIT_CODE
@@ -110,7 +104,7 @@ function validate_agent_service() {
    echo "======================Testing worker sql agent======================"
    export agent_port="9096"
    prompt="How many employees are there in the company?"
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ip_addr $agent_ip --ext_port $agent_port)
    local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
    echo $CONTENT
    # echo $EXIT_CODE
@@ -123,7 +117,7 @@ function validate_agent_service() {
    # test supervisor react agent
    echo "======================Testing supervisor react agent======================"
    export agent_port="9090"
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ip_addr $agent_ip --ext_port $agent_port --stream)
    local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
    # echo $CONTENT
    echo $EXIT_CODE
@@ -144,18 +138,68 @@ function remove_chinook_data(){
    echo "Chinook data removed!"
}

export host_ip=$ip_address
echo "ip_address=${ip_address}"


function validate() {
    local CONTENT="$1"
    local EXPECTED_RESULT="$2"
    local SERVICE_NAME="$3"

    if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
        echo "[ $SERVICE_NAME ] Content is as expected: $CONTENT"
        echo 0
    else
        echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
        echo 1
    fi
}

function ingest_data_and_validate() {
    echo "Ingesting data"
    cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/
    echo $PWD
    local CONTENT=$(bash run_ingest_data.sh)
    local EXIT_CODE=$(validate "$CONTENT" "Data preparation succeeded" "dataprep-redis-server")
    echo "$EXIT_CODE"
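    # validate() echoes the matched content and then 0/1 on the last line;
    # the substring below keeps only that final character as the status code.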
    local EXIT_CODE="${EXIT_CODE:0-1}"
    echo "return value is $EXIT_CODE"
    if [ "$EXIT_CODE" == "1" ]; then
        docker logs dataprep-redis-server
        return 1
    fi
}

function validate_retrieval_tool() {
    echo "----------------Test retrieval tool ----------------"
    local CONTENT=$(http_proxy="" curl http://${ip_address}:8889/v1/retrievaltool -X POST -H "Content-Type: application/json" -d '{
        "text": "Who sang Thriller"
    }')
    local EXIT_CODE=$(validate "$CONTENT" "Thriller" "retrieval-tool")

    if [ "$EXIT_CODE" == "1" ]; then
        docker logs retrievaltool-xeon-backend-server
        exit 1
    fi
}

function main() {
    echo "==================== Prepare data ===================="
    download_chinook_data
    echo "==================== Data prepare done ===================="

    echo "==================== Start VLLM service ===================="
    start_vllm_service_70B
    echo "==================== VLLM service started ===================="
    echo "==================== Start all services ===================="
    start_all_services
    echo "==================== all services started ===================="

    echo "==================== Start agent ===================="
    start_agent_and_api_server
    echo "==================== Agent started ===================="
    echo "==================== Ingest data ===================="
    ingest_data_and_validate
    echo "==================== Data ingestion completed ===================="

    echo "==================== Validate retrieval tool ===================="
    validate_retrieval_tool
    echo "==================== Retrieval tool validated ===================="

    echo "==================== Validate agent service ===================="
    validate_agent_service

@@ -11,13 +11,22 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/


function download_chinook_data(){
    echo "Downloading chinook data..."
    cd $WORKDIR
    git clone https://github.com/lerocha/chinook-database.git
    cp chinook-database/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite $WORKDIR/GenAIExamples/AgentQnA/tests/
}

function start_agent_and_api_server() {
    echo "Starting CRAG server"
    docker run -d --runtime=runc --name=kdd-cup-24-crag-service -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0

    echo "Starting Agent services"
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon/
    bash launch_agent_service_openai.sh
    sleep 2m
}

function validate() {
@@ -35,19 +44,64 @@ function validate() {
}

function validate_agent_service() {
    echo "----------------Test agent ----------------"
    local CONTENT=$(http_proxy="" curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
        "query": "Tell me about Michael Jackson song thriller"
    }')
    local EXIT_CODE=$(validate "$CONTENT" "Thriller" "react-agent-endpoint")
    docker logs react-agent-endpoint
    # # test worker rag agent
    echo "======================Testing worker rag agent======================"
    export agent_port="9095"
    prompt="Tell me about Michael Jackson song Thriller"
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
    # echo $CONTENT
    local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
    echo $EXIT_CODE
    local EXIT_CODE="${EXIT_CODE:0-1}"
    if [ "$EXIT_CODE" == "1" ]; then
        docker logs rag-agent-endpoint
        exit 1
    fi

    # # test worker sql agent
    echo "======================Testing worker sql agent======================"
    export agent_port="9096"
    prompt="How many employees are there in the company?"
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
    local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
    echo $CONTENT
    # echo $EXIT_CODE
    local EXIT_CODE="${EXIT_CODE:0-1}"
    if [ "$EXIT_CODE" == "1" ]; then
        docker logs sql-agent-endpoint
        exit 1
    fi

    # test supervisor react agent
    echo "======================Testing supervisor react agent======================"
    export agent_port="9090"
    local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
    local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
    # echo $CONTENT
    echo $EXIT_CODE
    local EXIT_CODE="${EXIT_CODE:0-1}"
    if [ "$EXIT_CODE" == "1" ]; then
        docker logs react-agent-endpoint
        exit 1
    fi

}

function remove_chinook_data(){
    echo "Removing chinook data..."
    cd $WORKDIR
    if [ -d "chinook-database" ]; then
        rm -rf chinook-database
    fi
    echo "Chinook data removed!"
}


function main() {
    echo "==================== Prepare data ===================="
    download_chinook_data
    echo "==================== Data prepare done ===================="

    echo "==================== Start agent ===================="
    start_agent_and_api_server
    echo "==================== Agent started ===================="
@@ -57,4 +111,9 @@ function main() {
    echo "==================== Agent service validated ===================="
}


remove_chinook_data

main

remove_chinook_data

@@ -9,6 +9,22 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export no_proxy="$no_proxy,rag-agent-endpoint,sql-agent-endpoint,react-agent-endpoint,agent-ui,vllm-gaudi-server,jaeger,grafana,prometheus,127.0.0.1,localhost,0.0.0.0,$ip_address"


function get_genai_comps() {
    if [ ! -d "GenAIComps" ] ; then
        git clone --depth 1 --branch ${opea_branch:-"main"} https://github.com/opea-project/GenAIComps.git
    fi
}


function build_agent_docker_image() {
    cd $WORKDIR/GenAIExamples/AgentQnA/docker_image_build/
    get_genai_comps
    echo "Build agent image with --no-cache..."
    docker compose -f build.yaml build --no-cache
}

function stop_crag() {
    cid=$(docker ps -aq --filter "name=kdd-cup-24-crag-service")
@@ -18,12 +34,7 @@ function stop_crag() {

function stop_agent_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi/
    container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2)
    for container_name in $container_list; do
        cid=$(docker ps -aq --filter "name=$container_name")
        echo "Stopping container $container_name"
        if [[ ! -z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi
    done
    docker compose -f $WORKDIR/GenAIExamples/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml -f compose.yaml down
}

function stop_llm(){
@@ -59,33 +70,21 @@ function stop_retrieval_tool() {
echo "workpath: $WORKPATH"
echo "=================== Stop containers ===================="
stop_crag
stop_llm
stop_agent_docker
stop_retrieval_tool

cd $WORKPATH/tests

echo "=================== #1 Building docker images===================="
bash step1_build_images.sh
build_agent_docker_image
echo "=================== #1 Building docker images completed===================="

echo "=================== #2 Start retrieval tool===================="
bash step2_start_retrieval_tool.sh
echo "=================== #2 Retrieval tool started===================="

echo "=================== #3 Ingest data and validate retrieval===================="
bash step3_ingest_data_and_validate_retrieval.sh
echo "=================== #3 Data ingestion and validation completed===================="

echo "=================== #4 Start agent and API server===================="
bash step4_launch_and_validate_agent_gaudi.sh
echo "=================== #4 Agent test passed ===================="
echo "=================== #4 Start agent, API server, retrieval, and ingest data===================="
bash $WORKPATH/tests/step4_launch_and_validate_agent_gaudi.sh
echo "=================== #4 Agent, retrieval test passed ===================="

echo "=================== #5 Stop agent and API server===================="
stop_crag
stop_agent_docker
stop_retrieval_tool
stop_llm
echo "=================== #5 Agent and API server stopped===================="

echo y | docker system prune

77 AgentQnA/tools/supervisor_agent_webtools.yaml Normal file
@@ -0,0 +1,77 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
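# Editor's note: each top-level key below names a tool exposed to the supervisor
# agent; callable_api points at its Python implementation (file:function),
# args_schema describes the expected arguments, and return_output names the
# field handed back to the agent.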

search_web_base:
  description: Search a web base for a given query. Returns text related to the query.
  callable_api: tools.py:search_web_base
  args_schema:
    query:
      type: str
      description: query
  return_output: retrieved_data

search_knowledge_base:
  description: Search a knowledge base for a given query. Returns text related to the query.
  callable_api: tools.py:search_knowledge_base
  args_schema:
    query:
      type: str
      description: query
  return_output: retrieved_data

search_artist_database:
  description: Search a SQL database on artists and their music with a natural language query. Returns text related to the query.
  callable_api: tools.py:search_sql_database
  args_schema:
    query:
      type: str
      description: natural language query
  return_output: retrieved_data

get_artist_birth_place:
  description: Get the birth place of an artist.
  callable_api: tools.py:get_artist_birth_place
  args_schema:
    artist_name:
      type: str
      description: artist name
  return_output: birth_place

get_billboard_rank_date:
  description: Get Billboard ranking for a specific rank and date.
  callable_api: tools.py:get_billboard_rank_date
  args_schema:
    rank:
      type: int
      description: the rank of interest, for example 1 for top 1
    date:
      type: str
      description: date
  return_output: billboard_info

get_song_release_date:
  description: Get the release date of a song.
  callable_api: tools.py:get_song_release_date
  args_schema:
    song_name:
      type: str
      description: song name
  return_output: release_date

get_members:
  description: Get the member list of a band.
  callable_api: tools.py:get_members
  args_schema:
    band_name:
      type: str
      description: band name
  return_output: members

get_grammy_best_artist_by_year:
  description: Get the Grammy Best New Artist for a specific year.
  callable_api: tools.py:get_grammy_best_artist_by_year
  args_schema:
    year:
      type: int
      description: year
  return_output: grammy_best_new_artist
@@ -7,6 +7,24 @@ import requests
from tools.pycragapi import CRAG


def search_web_base(query: str) -> str:
    import os

    from langchain_core.tools import Tool
    from langchain_google_community import GoogleSearchAPIWrapper
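
    # Editor's note: GoogleSearchAPIWrapper needs Google Programmable Search
    # credentials; by langchain convention these are read from the GOOGLE_API_KEY
    # and GOOGLE_CSE_ID environment variables (presumably the reason for the
    # `import os` above). Export both before calling this tool.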
    search = GoogleSearchAPIWrapper()

    tool = Tool(
        name="google_search",
        description="Search Google for recent results.",
        func=search.run,
    )

    response = tool.run(query)
    return response


def search_knowledge_base(query: str) -> str:
    """Search a knowledge base about music and singers for a given query.


@@ -21,10 +21,22 @@ Here're some of the project's features:
   cd AgentQnA/ui/svelte
   ```

3. Modify the required .env variables.
3. Modify the required .env variables. The `AGENT_URL` should be in the following form:

   ```
   AGENT_URL = ''
   AGENT_URL = "http://${ip_address}:${agent_port}/v1/chat/completions"
   ```

   For example, assuming the IP address of the host machine is 10.10.10.1 and the agent port is 9090, then:

   ```
   AGENT_URL = "http://10.10.10.1:9090/v1/chat/completions"
   ```

   You can get the IP address of the host machine by running the command below:

   ```bash
   export ip_address=$(hostname -I | awk '{print $1}')
   ```
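
   As a convenience, the AgentQnA test scripts write this file in one line (a sketch; assumes the default supervisor port 9090):

   ```bash
   echo "AGENT_URL = 'http://${ip_address}:9090/v1/chat/completions'" | tee .env
   ```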

4. **For Local Development:**
@@ -57,4 +69,4 @@ Here're some of the project's features:
   docker run -d -p 5173:5173 --name agent-ui opea:agent-ui
   ```

- The application will be available at `http://localhost:5173`.
- The application will be available at `http://${ip_address}:5173`. You can access it with a web browser on your laptop. Note that `ip_address` should be the IP address of the host machine where the UI container runs.

@@ -16,7 +16,7 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")


def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):

@@ -17,7 +17,7 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")


def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):

@@ -2,6 +2,10 @@

This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server.

The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; refer to the [Start the MegaService](#-start-the-megaservice) section on this page.

Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure you have either requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
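
Once access is granted, a gated model can be fetched with your Hugging Face token (a sketch; the token is the same one exported as `HUGGINGFACEHUB_API_TOKEN` below):

```bash
huggingface-cli login   # paste your Hugging Face token when prompted
huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct
```
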
## 🚀 Build Docker images

### 1. Source Code install GenAIComps
@@ -17,9 +21,15 @@ cd GenAIComps
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
```

### 3. Build LLM Image
### 3. Build vLLM Image

Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu (https://github.com/huggingface/text-generation-inference)
```bash
git clone https://github.com/vllm-project/vllm.git
cd ./vllm/
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
git checkout ${VLLM_VER}
docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu -t opea/vllm:latest --shm-size=128g .
```

### 4. Build TTS Image

@@ -43,9 +53,10 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`; you will have the following images ready:

1. `opea/whisper:latest`
2. `opea/speecht5:latest`
3. `opea/audioqna:latest`
4. `opea/gpt-sovits:latest` (optional)
2. `opea/vllm:latest`
3. `opea/speecht5:latest`
4. `opea/audioqna:latest`
5. `opea/gpt-sovits:latest` (optional)

## 🚀 Set the environment variables

@@ -55,7 +66,7 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>

export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"

export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -73,40 +84,90 @@ export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna

Or use the set_env.sh file to set up the environment variables.

Note: Please replace host_ip with your external IP address; do not use localhost.
Note:

- Please replace host_ip with your external IP address; do not use localhost.
- If you are in a proxy environment, also set the proxy-related environment variables:

  ```
  export http_proxy="Your_HTTP_Proxy"
  export https_proxy="Your_HTTPs_Proxy"
  # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
  export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,gpt-sovits-service,tgi-service,vllm-service,audioqna-xeon-backend-server,audioqna-xeon-ui-server
  ```

## 🚀 Start the MegaService

```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
```

If using vLLM as the LLM serving backend:

```
docker compose up -d

# multilang tts (optional)
docker compose -f compose_multilang.yaml up -d
```

If using TGI as the LLM serving backend:

```
docker compose -f compose_tgi.yaml up -d
```

## 🚀 Test MicroServices

```bash
# whisper service
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:7066/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@./sample.wav" \
  -F model="openai/whisper-small"
1. Whisper Service

# tgi service
curl http://${host_ip}:3006/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
  -H 'Content-Type: application/json'
```bash
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@./sample.wav" \
  -F model="openai/whisper-small"
```

# speecht5 service
curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
2. LLM backend Service

# gpt-sovits service (optional)
curl http://${host_ip}:9880/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
On first startup, this service takes more time to download, load, and warm up the model. Once finished, the service is ready and the container (`vllm-service` or `tgi-service`) status shown via `docker ps` will be `healthy`; before that, the status will be `health: starting`.

Or try the command below to check whether the LLM serving is ready.

```bash
# vLLM service
docker logs vllm-service 2>&1 | grep complete
# If the service is ready, you will get a response like the one below.
INFO: Application startup complete.
```

```bash
# TGI service
docker logs tgi-service | grep Connected
# If the service is ready, you will get a response like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```

Then try the `cURL` command below to validate the services.

```bash
# either vLLM or TGI service
curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
  -X POST \
  -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
  -H 'Content-Type: application/json'
```

3. TTS Service

```bash
# speecht5 service
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3

# gpt-sovits service (optional)
curl http://${host_ip}:${GPT_SOVITS_SERVER_PORT}/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```

## 🚀 Test MegaService

@@ -6,7 +6,7 @@ services:
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
    container_name: whisper-service
    ports:
      - "7066:7066"
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
@@ -17,38 +17,41 @@ services:
    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
    container_name: speecht5-service
    ports:
      - "7055:7055"
      - ${SPEECHT5_SERVER_PORT:-7055}:7055
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    container_name: tgi-service
  vllm-service:
    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
    container_name: vllm-service
    ports:
      - "3006:80"
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
    command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
  audioqna-xeon-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-xeon-backend-server
    depends_on:
      - whisper-service
      - tgi-service
      - vllm-service
      - speecht5-service
    ports:
      - "3008:8888"

@@ -6,7 +6,7 @@ services:
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
    container_name: whisper-service
    ports:
      - "7066:7066"
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
@@ -18,27 +18,35 @@ services:
    image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
    container_name: gpt-sovits-service
    ports:
      - "9880:9880"
      - ${GPT_SOVITS_SERVER_PORT:-9880}:9880
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    container_name: tgi-service
  vllm-service:
    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
    container_name: vllm-service
    ports:
      - "3006:80"
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model ${LLM_MODEL_ID} --host 0.0.0.0 --port 80
  audioqna-xeon-backend-server:
    image: ${REGISTRY:-opea}/audioqna-multilang:${TAG:-latest}
    container_name: audioqna-xeon-backend-server

87 AudioQnA/docker_compose/intel/cpu/xeon/compose_tgi.yaml Normal file
@@ -0,0 +1,87 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  whisper-service:
    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
    container_name: whisper-service
    ports:
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
  speecht5-service:
    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
    container_name: speecht5-service
    ports:
      - ${SPEECHT5_SERVER_PORT:-7055}:7055
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    container_name: tgi-service
    ports:
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
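    # Editor's note: compose substitutes $host_ip and ${LLM_SERVER_PORT} in the
    # healthcheck below from the host shell environment, so both must be
    # exported before running `docker compose up`.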
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
  audioqna-xeon-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-xeon-backend-server
    depends_on:
      - whisper-service
      - tgi-service
      - speecht5-service
    ports:
      - "3008:8888"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
      - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
      - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
      - LLM_SERVER_PORT=${LLM_SERVER_PORT}
      - LLM_MODEL_ID=${LLM_MODEL_ID}
      - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
      - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
    ipc: host
    restart: always
  audioqna-xeon-ui-server:
    image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
    container_name: audioqna-xeon-ui-server
    depends_on:
      - audioqna-xeon-backend-server
    ports:
      - "5173:5173"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
    ipc: host
    restart: always

networks:
  default:
    driver: bridge
@@ -8,7 +8,7 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# <token>

export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"

export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}

@@ -2,6 +2,10 @@

This document outlines the deployment process for an AudioQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server.

The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; refer to the [Start the MegaService](#-start-the-megaservice) section on this page.

Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure you have either requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).

## 🚀 Build Docker images

### 1. Source Code install GenAIComps
@@ -17,9 +21,13 @@ cd GenAIComps
docker build -t opea/whisper-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile.intel_hpu .
```

### 3. Build LLM Image
### 3. Build vLLM Image

Intel Xeon optimized image hosted in huggingface repo will be used for TGI service: ghcr.io/huggingface/tgi-gaudi:2.0.6 (https://github.com/huggingface/tgi-gaudi)
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork/
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
git checkout ${VLLM_VER}
docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g .

### 4. Build TTS Image

@@ -40,8 +48,9 @@ docker build --no-cache -t opea/audioqna:latest --build-arg https_proxy=$https_p
Then run the command `docker images`; you will have the following images ready:

1. `opea/whisper-gaudi:latest`
2. `opea/speecht5-gaudi:latest`
3. `opea/audioqna:latest`
2. `opea/vllm-gaudi:latest`
3. `opea/speecht5-gaudi:latest`
4. `opea/audioqna:latest`

## 🚀 Set the environment variables

@@ -51,7 +60,12 @@ Before starting the services with `docker compose`, you have to recheck the foll
export host_ip=<your External Public IP> # export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=<your HF token>

export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
# set vLLM parameters
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
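# Editor's note: these values feed the vLLM launch command in compose.yaml
# (--tensor-parallel-size ${NUM_CARDS} --block-size ${BLOCK_SIZE}
# --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE});
# increase NUM_CARDS to shard the model across more Gaudi cards.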

export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
@@ -65,37 +79,90 @@ export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
```

Or use the set_env.sh file to set up the environment variables.

Note:

- Please replace host_ip with your external IP address; do not use localhost.
- If you are in a proxy environment, also set the proxy-related environment variables:

  ```
  export http_proxy="Your_HTTP_Proxy"
  export https_proxy="Your_HTTPs_Proxy"
  # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
  export no_proxy="Your_No_Proxy",${host_ip},whisper-service,speecht5-service,tgi-service,vllm-service,audioqna-gaudi-backend-server,audioqna-gaudi-ui-server
  ```

## 🚀 Start the MegaService

> **_NOTE:_** Users will need at least three Gaudi cards for AudioQnA.
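
Before starting, you can check how many Gaudi devices are visible (a sketch; `hl-smi` is the Habana device utility and requires the habanalabs drivers to be installed):

```bash
hl-smi
```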
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/hpu/gaudi/
```

If using vLLM as the LLM serving backend:

```
docker compose up -d
```

If using TGI as the LLM serving backend:

```
docker compose -f compose_tgi.yaml up -d
```

## 🚀 Test MicroServices

```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
  -X POST \
  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
  -H 'Content-Type: application/json'
1. Whisper Service

# tgi service
curl http://${host_ip}:3006/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
  -H 'Content-Type: application/json'
```bash
curl http://${host_ip}:${WHISPER_SERVER_PORT}/v1/asr \
  -X POST \
  -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
  -H 'Content-Type: application/json'
```

# speecht5 service
curl http://${host_ip}:7055/v1/tts \
  -X POST \
  -d '{"text": "Who are you?"}' \
  -H 'Content-Type: application/json'
2. LLM backend Service

```
On first startup, this service takes more time to download, load, and warm up the model. Once finished, the service is ready and the container (`vllm-gaudi-service` or `tgi-gaudi-service`) status shown via `docker ps` will be `healthy`; before that, the status will be `health: starting`.

Or try the command below to check whether the LLM serving is ready.

```bash
# vLLM service
docker logs vllm-gaudi-service 2>&1 | grep complete
# If the service is ready, you will get a response like the one below.
INFO: Application startup complete.
```

```bash
# TGI service
docker logs tgi-gaudi-service | grep Connected
# If the service is ready, you will get a response like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```

Then try the `cURL` command below to validate the services.

```bash
# either vLLM or TGI service
curl http://${host_ip}:${LLM_SERVER_PORT}/v1/chat/completions \
  -X POST \
  -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
  -H 'Content-Type: application/json'
```

3. TTS Service

```bash
# speecht5 service
curl http://${host_ip}:${SPEECHT5_SERVER_PORT}/v1/tts \
  -X POST \
  -d '{"text": "Who are you?"}' \
  -H 'Content-Type: application/json'
```

## 🚀 Test MegaService

@@ -6,7 +6,7 @@ services:
    image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
    container_name: whisper-service
    ports:
      - "7066:7066"
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
@@ -22,7 +22,7 @@ services:
    image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
    container_name: speecht5-service
    ports:
      - "7055:7055"
      - ${SPEECHT5_SERVER_PORT:-7055}:7055
    ipc: host
    environment:
      no_proxy: ${no_proxy}
@@ -34,28 +34,27 @@ services:
    cap_add:
      - SYS_NICE
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
    container_name: tgi-gaudi-server
  vllm-service:
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
    container_name: vllm-gaudi-service
    ports:
      - "3006:80"
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      ENABLE_HPU_GRAPH: true
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:3006/health || exit 1"]
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
@@ -63,13 +62,13 @@ services:
    cap_add:
      - SYS_NICE
    ipc: host
    command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
  audioqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-gaudi-backend-server
    depends_on:
      - whisper-service
      - tgi-service
      - vllm-service
      - speecht5-service
    ports:
      - "3008:8888"

108 AudioQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml Normal file
@@ -0,0 +1,108 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  whisper-service:
    image: ${REGISTRY:-opea}/whisper-gaudi:${TAG:-latest}
    container_name: whisper-service
    ports:
      - ${WHISPER_SERVER_PORT:-7066}:7066
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
    runtime: habana
    cap_add:
      - SYS_NICE
    restart: unless-stopped
  speecht5-service:
    image: ${REGISTRY:-opea}/speecht5-gaudi:${TAG:-latest}
    container_name: speecht5-service
    ports:
      - ${SPEECHT5_SERVER_PORT:-7055}:7055
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
    runtime: habana
    cap_add:
      - SYS_NICE
    restart: unless-stopped
  tgi-service:
    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
    container_name: tgi-gaudi-service
    ports:
      - ${LLM_SERVER_PORT:-3006}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      ENABLE_HPU_GRAPH: true
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
      LLM_SERVER_PORT: ${LLM_SERVER_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://$host_ip:${LLM_SERVER_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
  audioqna-gaudi-backend-server:
    image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
    container_name: audioqna-gaudi-backend-server
    depends_on:
      - whisper-service
      - tgi-service
      - speecht5-service
    ports:
      - "3008:8888"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - WHISPER_SERVER_HOST_IP=${WHISPER_SERVER_HOST_IP}
      - WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
      - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
      - LLM_SERVER_PORT=${LLM_SERVER_PORT}
      - LLM_MODEL_ID=${LLM_MODEL_ID}
      - SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
      - SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
    ipc: host
    restart: always
  audioqna-gaudi-ui-server:
    image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
    container_name: audioqna-gaudi-ui-server
    depends_on:
      - audioqna-gaudi-backend-server
    ports:
      - "5173:5173"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
    ipc: host
    restart: always

networks:
  default:
    driver: bridge
@@ -8,7 +8,13 @@ export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
# <token>

export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"

# set vLLM parameters
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048

export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}

@@ -71,3 +71,15 @@ services:
      dockerfile: comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile
    extends: audioqna
    image: ${REGISTRY:-opea}/gpt-sovits:${TAG:-latest}
  vllm:
    build:
      context: vllm
      dockerfile: Dockerfile.cpu
    extends: audioqna
    image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
  vllm-gaudi:
    build:
      context: vllm-fork
      dockerfile: Dockerfile.hpu
    extends: audioqna
    image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}

@@ -31,18 +31,27 @@ function build_docker_images() {
    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    git clone https://github.com/HabanaAI/vllm-fork.git
    cd vllm-fork/
    VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi vllm-gaudi"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
    export NUM_CARDS=1
    export BLOCK_SIZE=128
    export MAX_NUM_SEQS=256
    export MAX_SEQ_LEN_TO_CAPTURE=2048

    export MEGA_SERVICE_HOST_IP=${ip_address}
    export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -61,8 +70,8 @@ function start_services() {
    docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
    until [[ "$n" -ge 200 ]]; do
        docker logs tgi-gaudi-server > $LOG_PATH/tgi_service_start.log
        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
        docker logs vllm-gaudi-service > $LOG_PATH/vllm_service_start.log 2>&1
        if grep -q complete $LOG_PATH/vllm_service_start.log; then
            break
        fi
        sleep 5s
@@ -86,7 +95,7 @@ function validate_megaservice() {
    # always print the log
    docker logs whisper-service > $LOG_PATH/whisper-service.log
    docker logs speecht5-service > $LOG_PATH/tts-service.log
    docker logs tgi-gaudi-server > $LOG_PATH/tgi-gaudi-server.log
    docker logs vllm-gaudi-service > $LOG_PATH/vllm-gaudi-service.log
    docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
    echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3

@@ -126,7 +135,7 @@ function validate_megaservice() {

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    docker compose stop && docker compose rm -f
    docker compose -f compose.yaml stop && docker compose rm -f
}

function main() {

@@ -31,18 +31,23 @@ function build_docker_images() {
    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    git clone https://github.com/vllm-project/vllm.git
    cd ./vllm/
    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper speecht5"
    service_list="audioqna audioqna-ui whisper speecht5 vllm"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3
    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct

    export MEGA_SERVICE_HOST_IP=${ip_address}
    export WHISPER_SERVER_HOST_IP=${ip_address}
@@ -62,8 +67,8 @@ function start_services() {
    docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
    until [[ "$n" -ge 200 ]]; do
        docker logs tgi-service > $LOG_PATH/tgi_service_start.log
        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
        docker logs vllm-service > $LOG_PATH/vllm_service_start.log 2>&1
        if grep -q complete $LOG_PATH/vllm_service_start.log; then
            break
        fi
        sleep 5s
@@ -77,7 +82,7 @@ function validate_megaservice() {
    # always print the log
    docker logs whisper-service > $LOG_PATH/whisper-service.log
    docker logs speecht5-service > $LOG_PATH/tts-service.log
    docker logs tgi-service > $LOG_PATH/tgi-service.log
    docker logs vllm-service > $LOG_PATH/vllm-service.log
    docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
    echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3

@@ -117,7 +122,7 @@ function validate_megaservice() {

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon/
    docker compose stop && docker compose rm -f
    docker compose -f compose.yaml stop && docker compose rm -f
}

function main() {

146 AudioQnA/tests/test_compose_tgi_on_gaudi.sh Normal file
@@ -0,0 +1,146 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="audioqna audioqna-ui whisper-gaudi speecht5-gaudi"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct

    export MEGA_SERVICE_HOST_IP=${ip_address}
    export WHISPER_SERVER_HOST_IP=${ip_address}
    export SPEECHT5_SERVER_HOST_IP=${ip_address}
    export LLM_SERVER_HOST_IP=${ip_address}

    export WHISPER_SERVER_PORT=7066
    export SPEECHT5_SERVER_PORT=7055
    export LLM_SERVER_PORT=3006

    export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
    export host_ip=${ip_address}
    # sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env

    # Start Docker Containers
    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
    n=0
    until [[ "$n" -ge 200 ]]; do
        docker logs tgi-gaudi-service > $LOG_PATH/tgi_service_start.log
        if grep -q Connected $LOG_PATH/tgi_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done

    n=0
    until [[ "$n" -ge 100 ]]; do
        docker logs whisper-service > $LOG_PATH/whisper_service_start.log
        if grep -q "Uvicorn server setup on port" $LOG_PATH/whisper_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done
}


function validate_megaservice() {
    response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
    # always print the log
    docker logs whisper-service > $LOG_PATH/whisper-service.log
    docker logs speecht5-service > $LOG_PATH/tts-service.log
    docker logs tgi-gaudi-service > $LOG_PATH/tgi-gaudi-service.log
    docker logs audioqna-gaudi-backend-server > $LOG_PATH/audioqna-gaudi-backend-server.log
    echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3

    if [[ $(file speech.mp3) == *"RIFF"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong."
        exit 1
    fi

}

#function validate_frontend() {
#    cd $WORKPATH/ui/svelte
#    local conda_env_name="OPEA_e2e"
#    export PATH=${HOME}/miniforge3/bin/:$PATH
##    conda remove -n ${conda_env_name} --all -y
##    conda create -n ${conda_env_name} python=3.12 -y
#    source activate ${conda_env_name}
#
#    sed -i "s/localhost/$ip_address/g" playwright.config.ts
#
##    conda install -c conda-forge nodejs=22.6.0 -y
#    npm install && npm ci && npx playwright install --with-deps
#    node -v && npm -v && pip list
#
#    exit_status=0
#    npx playwright test || exit_status=$?
#
#    if [ $exit_status -ne 0 ]; then
#        echo "[TEST INFO]: ---------frontend test failed---------"
|
||||
# exit $exit_status
|
||||
# else
|
||||
# echo "[TEST INFO]: ---------frontend test passed---------"
|
||||
# fi
|
||||
#}
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
docker compose -f compose_tgi.yaml stop && docker compose rm -f
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
||||
stop_docker
|
||||
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
|
||||
start_services
|
||||
|
||||
validate_megaservice
|
||||
# validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
}
|
||||
|
||||
main
|
||||
137
AudioQnA/tests/test_compose_tgi_on_xeon.sh
Normal file
@@ -0,0 +1,137 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -e
|
||||
IMAGE_REPO=${IMAGE_REPO:-"opea"}
|
||||
IMAGE_TAG=${IMAGE_TAG:-"latest"}
|
||||
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"./data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
ip_address=$(hostname -I | awk '{print $1}')
|
||||
|
||||
function build_docker_images() {
|
||||
opea_branch=${opea_branch:-"main"}
|
||||
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
|
||||
if [[ "${opea_branch}" != "main" ]]; then
|
||||
cd $WORKPATH
|
||||
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
|
||||
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
|
||||
find . -type f -name "Dockerfile*" | while read -r file; do
|
||||
echo "Processing file: $file"
|
||||
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
|
||||
done
|
||||
fi
|
||||
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="audioqna audioqna-ui whisper speecht5"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
|
||||
docker images && sleep 1s
|
||||
}
|
||||
|
||||
function start_services() {
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon/
|
||||
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export LLM_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
|
||||
|
||||
export MEGA_SERVICE_HOST_IP=${ip_address}
|
||||
export WHISPER_SERVER_HOST_IP=${ip_address}
|
||||
export SPEECHT5_SERVER_HOST_IP=${ip_address}
|
||||
export LLM_SERVER_HOST_IP=${ip_address}
|
||||
|
||||
export WHISPER_SERVER_PORT=7066
|
||||
export SPEECHT5_SERVER_PORT=7055
|
||||
export LLM_SERVER_PORT=3006
|
||||
|
||||
export BACKEND_SERVICE_ENDPOINT=http://${ip_address}:3008/v1/audioqna
|
||||
export host_ip=${ip_address}
|
||||
|
||||
# sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
n=0
|
||||
until [[ "$n" -ge 200 ]]; do
|
||||
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
|
||||
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
|
||||
break
|
||||
fi
|
||||
sleep 5s
|
||||
n=$((n+1))
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
function validate_megaservice() {
|
||||
response=$(http_proxy="" curl http://${ip_address}:3008/v1/audioqna -XPOST -d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64}' -H 'Content-Type: application/json')
|
||||
# always print the log
|
||||
docker logs whisper-service > $LOG_PATH/whisper-service.log
|
||||
docker logs speecht5-service > $LOG_PATH/tts-service.log
|
||||
docker logs tgi-service > $LOG_PATH/tgi-service.log
|
||||
docker logs audioqna-xeon-backend-server > $LOG_PATH/audioqna-xeon-backend-server.log
|
||||
echo "$response" | sed 's/^"//;s/"$//' | base64 -d > speech.mp3
|
||||
|
||||
if [[ $(file speech.mp3) == *"RIFF"* ]]; then
|
||||
echo "Result correct."
|
||||
else
|
||||
echo "Result wrong."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
#function validate_frontend() {
|
||||
# cd $WORKPATH/ui/svelte
|
||||
# local conda_env_name="OPEA_e2e"
|
||||
# export PATH=${HOME}/miniforge3/bin/:$PATH
|
||||
## conda remove -n ${conda_env_name} --all -y
|
||||
## conda create -n ${conda_env_name} python=3.12 -y
|
||||
# source activate ${conda_env_name}
|
||||
#
|
||||
# sed -i "s/localhost/$ip_address/g" playwright.config.ts
|
||||
#
|
||||
## conda install -c conda-forge nodejs=22.6.0 -y
|
||||
# npm install && npm ci && npx playwright install --with-deps
|
||||
# node -v && npm -v && pip list
|
||||
#
|
||||
# exit_status=0
|
||||
# npx playwright test || exit_status=$?
|
||||
#
|
||||
# if [ $exit_status -ne 0 ]; then
|
||||
# echo "[TEST INFO]: ---------frontend test failed---------"
|
||||
# exit $exit_status
|
||||
# else
|
||||
# echo "[TEST INFO]: ---------frontend test passed---------"
|
||||
# fi
|
||||
#}
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/cpu/xeon/
|
||||
docker compose -f compose_tgi.yaml stop && docker compose rm -f
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
||||
stop_docker
|
||||
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
|
||||
start_services
|
||||
|
||||
validate_megaservice
|
||||
# validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
}
|
||||
|
||||
main
|
||||
@@ -1,49 +1,10 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Stage 1: base setup used by other stages
|
||||
FROM python:3.11-slim AS base
|
||||
|
||||
# get security updates
|
||||
RUN apt-get update && apt-get upgrade -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV HOME=/home/user
|
||||
|
||||
RUN useradd -m -s /bin/bash user && \
|
||||
mkdir -p $HOME && \
|
||||
chown -R user $HOME
|
||||
|
||||
WORKDIR $HOME
|
||||
|
||||
|
||||
# Stage 2: latest GenAIComps sources
|
||||
FROM base AS git
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git
|
||||
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
|
||||
# Stage 3: common layer shared by services using GenAIComps
|
||||
FROM base AS comps-base
|
||||
|
||||
# copy just relevant parts
|
||||
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
|
||||
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
|
||||
|
||||
WORKDIR $HOME/GenAIComps
|
||||
RUN pip install --no-cache-dir --upgrade pip setuptools && \
|
||||
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
|
||||
WORKDIR $HOME
|
||||
|
||||
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
|
||||
|
||||
USER user
|
||||
|
||||
|
||||
# Stage 4: unique part
|
||||
FROM comps-base
|
||||
ARG BASE_TAG=latest
|
||||
FROM opea/comps-base:$BASE_TAG
|
||||
|
||||
COPY ./chatqna.py $HOME/chatqna.py
|
||||
COPY ./entrypoint.sh $HOME/entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["python", "chatqna.py"]
|
||||
ENTRYPOINT ["bash", "entrypoint.sh"]
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Stage 1: base setup used by other stages
|
||||
FROM python:3.11-slim AS base
|
||||
|
||||
# get security updates
|
||||
RUN apt-get update && apt-get upgrade -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV HOME=/home/user
|
||||
|
||||
RUN useradd -m -s /bin/bash user && \
|
||||
mkdir -p $HOME && \
|
||||
chown -R user $HOME
|
||||
|
||||
WORKDIR $HOME
|
||||
|
||||
|
||||
# Stage 2: latest GenAIComps sources
|
||||
FROM base AS git
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git
|
||||
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
|
||||
# Stage 3: common layer shared by services using GenAIComps
|
||||
FROM base AS comps-base
|
||||
|
||||
# copy just relevant parts
|
||||
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
|
||||
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
|
||||
|
||||
WORKDIR $HOME/GenAIComps
|
||||
RUN pip install --no-cache-dir --upgrade pip setuptools && \
|
||||
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
|
||||
WORKDIR $HOME
|
||||
|
||||
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
|
||||
|
||||
USER user
|
||||
|
||||
|
||||
# Stage 4: unique part
|
||||
FROM comps-base
|
||||
|
||||
COPY ./chatqna.py $HOME/chatqna.py
|
||||
|
||||
ENTRYPOINT ["python", "chatqna.py", "--with-guardrails"]
|
||||
@@ -1,49 +0,0 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Stage 1: base setup used by other stages
|
||||
FROM python:3.11-slim AS base
|
||||
|
||||
# get security updates
|
||||
RUN apt-get update && apt-get upgrade -y && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV HOME=/home/user
|
||||
|
||||
RUN useradd -m -s /bin/bash user && \
|
||||
mkdir -p $HOME && \
|
||||
chown -R user $HOME
|
||||
|
||||
WORKDIR $HOME
|
||||
|
||||
|
||||
# Stage 2: latest GenAIComps sources
|
||||
FROM base AS git
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git
|
||||
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
|
||||
|
||||
|
||||
# Stage 3: common layer shared by services using GenAIComps
|
||||
FROM base AS comps-base
|
||||
|
||||
# copy just relevant parts
|
||||
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
|
||||
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
|
||||
|
||||
WORKDIR $HOME/GenAIComps
|
||||
RUN pip install --no-cache-dir --upgrade pip setuptools && \
|
||||
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
|
||||
WORKDIR $HOME
|
||||
|
||||
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
|
||||
|
||||
USER user
|
||||
|
||||
|
||||
# Stage 4: unique part
|
||||
FROM comps-base
|
||||
|
||||
COPY ./chatqna.py $HOME/chatqna.py
|
||||
|
||||
ENTRYPOINT ["python", "chatqna.py", "--without-rerank"]
|
||||
@@ -70,11 +70,11 @@ To set up environment variables for deploying ChatQnA services, follow these ste
|
||||
# on Gaudi
|
||||
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
|
||||
source ./set_env.sh
|
||||
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
|
||||
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-node-exporter-1
|
||||
# on Xeon
|
||||
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
|
||||
source ./set_env.sh
|
||||
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
|
||||
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,xeon-node-exporter-1
|
||||
# on Nvidia GPU
|
||||
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
|
||||
source ./set_env.sh
|
||||
|
||||
@@ -159,7 +159,10 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
|
||||
next_data["inputs"] = prompt
|
||||
|
||||
elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["stream"]:
|
||||
next_data["text"] = data["choices"][0]["message"]["content"]
|
||||
if "faqgen" in self.services[cur_node].endpoint:
|
||||
next_data = data
|
||||
else:
|
||||
next_data["text"] = data["choices"][0]["message"]["content"]
|
||||
else:
|
||||
next_data = data
|
||||
|
||||
@@ -178,7 +181,12 @@ def align_generator(self, gen, **kwargs):
|
||||
try:
|
||||
# sometimes yield empty chunk, do a fallback here
|
||||
json_data = json.loads(json_str)
|
||||
if (
|
||||
if "ops" in json_data and "op" in json_data["ops"][0]:
|
||||
if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
|
||||
yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
|
||||
else:
|
||||
pass
|
||||
elif (
|
||||
json_data["choices"][0]["finish_reason"] != "eos_token"
|
||||
and "content" in json_data["choices"][0]["delta"]
|
||||
):
|
||||
@@ -329,6 +337,48 @@ class ChatQnAService:
|
||||
self.megaservice.flow_to(rerank, llm)
|
||||
# self.megaservice.flow_to(llm, guardrail_out)
|
||||
|
||||
def add_remote_service_faqgen(self):
|
||||
|
||||
embedding = MicroService(
|
||||
name="embedding",
|
||||
host=EMBEDDING_SERVER_HOST_IP,
|
||||
port=EMBEDDING_SERVER_PORT,
|
||||
endpoint="/embed",
|
||||
use_remote_service=True,
|
||||
service_type=ServiceType.EMBEDDING,
|
||||
)
|
||||
|
||||
retriever = MicroService(
|
||||
name="retriever",
|
||||
host=RETRIEVER_SERVICE_HOST_IP,
|
||||
port=RETRIEVER_SERVICE_PORT,
|
||||
endpoint="/v1/retrieval",
|
||||
use_remote_service=True,
|
||||
service_type=ServiceType.RETRIEVER,
|
||||
)
|
||||
|
||||
rerank = MicroService(
|
||||
name="rerank",
|
||||
host=RERANK_SERVER_HOST_IP,
|
||||
port=RERANK_SERVER_PORT,
|
||||
endpoint="/rerank",
|
||||
use_remote_service=True,
|
||||
service_type=ServiceType.RERANK,
|
||||
)
|
||||
|
||||
llm = MicroService(
|
||||
name="llm",
|
||||
host=LLM_SERVER_HOST_IP,
|
||||
port=LLM_SERVER_PORT,
|
||||
endpoint="/v1/faqgen",
|
||||
use_remote_service=True,
|
||||
service_type=ServiceType.LLM,
|
||||
)
|
||||
self.megaservice.add(embedding).add(retriever).add(rerank).add(llm)
|
||||
self.megaservice.flow_to(embedding, retriever)
|
||||
self.megaservice.flow_to(retriever, rerank)
|
||||
self.megaservice.flow_to(rerank, llm)
|
||||
|
||||
async def handle_request(self, request: Request):
|
||||
data = await request.json()
|
||||
stream_opt = data.get("stream", True)
|
||||
@@ -344,6 +394,7 @@ class ChatQnAService:
|
||||
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
|
||||
stream=stream_opt,
|
||||
chat_template=chat_request.chat_template if chat_request.chat_template else None,
|
||||
model=chat_request.model if chat_request.model else None,
|
||||
)
|
||||
retriever_parameters = RetrieverParms(
|
||||
search_type=chat_request.search_type if chat_request.search_type else "similarity",
|
||||
@@ -399,6 +450,7 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--without-rerank", action="store_true")
|
||||
parser.add_argument("--with-guardrails", action="store_true")
|
||||
parser.add_argument("--faqgen", action="store_true")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -407,6 +459,8 @@ if __name__ == "__main__":
|
||||
chatqna.add_remote_service_without_rerank()
|
||||
elif args.with_guardrails:
|
||||
chatqna.add_remote_service_with_guardrails()
|
||||
elif args.faqgen:
|
||||
chatqna.add_remote_service_faqgen()
|
||||
else:
|
||||
chatqna.add_remote_service()
|
||||
|
||||
|
||||
@@ -105,7 +105,15 @@ docker build --no-cache -t opea/retriever:latest --build-arg https_proxy=$https_
|
||||
docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
|
||||
```
|
||||
|
||||
### 4. Build MegaService Docker Image
|
||||
### 4. Build FaqGen LLM Image (Optional)
|
||||
|
||||
If you want to enable the FAQ generation LLM in the pipeline, use the command below:
|
||||
|
||||
```bash
|
||||
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
|
||||
```
|
||||
|
||||
### 5. Build MegaService Docker Image
|
||||
|
||||
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image using the command below:
|
||||
|
||||
@@ -116,7 +124,7 @@ docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_pr
|
||||
cd ../../..
|
||||
```
|
||||
|
||||
### 5. Build UI Docker Image
|
||||
### 6. Build UI Docker Image
|
||||
|
||||
Construct the frontend Docker image using the command below:
|
||||
|
||||
@@ -126,7 +134,7 @@ docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https
|
||||
cd ../../../..
|
||||
```
|
||||
|
||||
### 6. Build React UI Docker Image (Optional)
|
||||
### 7. Build React UI Docker Image (Optional)
|
||||
|
||||
Construct the frontend Docker image using the command below:
|
||||
|
||||
@@ -136,7 +144,7 @@ docker build --no-cache -t opea/chatqna-react-ui:latest --build-arg https_proxy=
|
||||
cd ../../../..
|
||||
```
|
||||
|
||||
### 7. Build Nginx Docker Image
|
||||
### 8. Build Nginx Docker Image
|
||||
|
||||
```bash
|
||||
cd GenAIComps
|
||||
@@ -151,6 +159,10 @@ Then run the command `docker images`, you will have the following 5 Docker Image
|
||||
4. `opea/chatqna-ui:latest` or `opea/chatqna-react-ui:latest`
|
||||
5. `opea/nginx:latest`
|
||||
|
||||
If the FaqGen Docker image is built, you will find one more image:
|
||||
|
||||
- `opea/llm-faqgen:latest`
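As an optional sanity check (an illustrative sketch, not part of the upstream instructions), you can filter `docker images` for the images listed above before starting the services:

```bash
# Illustrative check: confirm the expected OPEA images are present locally
docker images | grep -E 'opea/(chatqna|chatqna-ui|chatqna-react-ui|dataprep|retriever|nginx|llm-faqgen)'
```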
|
||||
|
||||
## 🚀 Start MicroServices and MegaService
|
||||
|
||||
### Required Models
|
||||
@@ -190,6 +202,7 @@ Change the `xxx_MODEL_ID` below for your needs.
|
||||
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8001
|
||||
export CHATQNA_REDIS_DATAPREP_PORT=6007
|
||||
export CHATQNA_REDIS_RETRIEVER_PORT=7000
|
||||
export CHATQNA_LLM_FAQGEN_PORT=9000
|
||||
export CHATQNA_INDEX_NAME="rag-redis"
|
||||
export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
|
||||
@@ -246,7 +259,10 @@ Please find more information about accessing and restricting AMD GPUs in the lin
|
||||
|
||||
```bash
|
||||
cd GenAIExamples/ChatQnA/docker_compose/amd/gpu/rocm
|
||||
## for text generation
|
||||
docker compose up -d
|
||||
## for FAQ generation
|
||||
docker compose -f compose_faqgen.yaml up -d
|
||||
```
|
||||
|
||||
### Validate MicroServices and MegaService
|
||||
@@ -310,7 +326,16 @@ docker compose up -d
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
5. MegaService
|
||||
5. FaqGen LLM Microservice (if enabled)
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:${CHATQNA_LLM_FAQGEN_PORT}/v1/faqgen \
|
||||
-X POST \
|
||||
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
6. MegaService
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
|
||||
@@ -318,7 +343,7 @@ docker compose up -d
|
||||
}'
|
||||
```
|
||||
|
||||
6. Nginx Service
|
||||
7. Nginx Service
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
|
||||
@@ -326,7 +351,7 @@ docker compose up -d
|
||||
-d '{"messages": "What is the revenue of Nike in 2023?"}'
|
||||
```
|
||||
|
||||
7. Dataprep Microservice(Optional)
|
||||
8. Dataprep Microservice (Optional)
|
||||
|
||||
If you want to update the default knowledge base, you can use the following commands:
|
||||
|
||||
|
||||
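A minimal sketch, assuming the dataprep service exposes a multipart ingest route at `/v1/dataprep/ingest` (the route and the `files` form field are assumptions; verify the exact endpoint against the dataprep component README for your image tag):

```bash
# Hypothetical sketch: upload a local document to the dataprep service
# (route, form field, and file name are assumptions, not taken from this README)
curl -X POST "http://${host_ip}:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/ingest" \
    -H "Content-Type: multipart/form-data" \
    -F "files=@./your_document.pdf"
```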
205
ChatQnA/docker_compose/amd/gpu/rocm/compose_faqgen.yaml
Normal file
@@ -0,0 +1,205 @@
|
||||
# Copyright (C) 2024 Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
chatqna-redis-vector-db:
|
||||
image: redis/redis-stack:7.2.0-v9
|
||||
container_name: redis-vector-db
|
||||
ports:
|
||||
- "${CHATQNA_REDIS_VECTOR_PORT}:6379"
|
||||
- "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT}:8001"
|
||||
chatqna-dataprep-redis-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-redis-server
|
||||
depends_on:
|
||||
- chatqna-redis-vector-db
|
||||
- chatqna-tei-embedding-service
|
||||
ports:
|
||||
- "${CHATQNA_REDIS_DATAPREP_PORT}:5000"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: ${CHATQNA_REDIS_URL}
|
||||
INDEX_NAME: ${CHATQNA_INDEX_NAME}
|
||||
TEI_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
|
||||
chatqna-tei-embedding-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: chatqna-tei-embedding-server
|
||||
ports:
|
||||
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
|
||||
shm_size: 1g
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
command: --model-id ${CHATQNA_EMBEDDING_MODEL_ID} --auto-truncate
|
||||
devices:
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri/card1:/dev/dri/card1
|
||||
- /dev/dri/renderD136:/dev/dri/renderD136
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
group_add:
|
||||
- video
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
chatqna-retriever:
|
||||
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
|
||||
container_name: chatqna-retriever-redis-server
|
||||
depends_on:
|
||||
- chatqna-redis-vector-db
|
||||
ports:
|
||||
- "${CHATQNA_REDIS_RETRIEVER_PORT}:7000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: ${CHATQNA_REDIS_URL}
|
||||
INDEX_NAME: ${CHATQNA_INDEX_NAME}
|
||||
TEI_EMBEDDING_ENDPOINT: ${CHATQNA_TEI_EMBEDDING_ENDPOINT}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
|
||||
restart: unless-stopped
|
||||
chatqna-tei-reranking-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: chatqna-tei-reranking-server
|
||||
ports:
|
||||
- "${CHATQNA_TEI_RERANKING_PORT}:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_API_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
devices:
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri/:/dev/dri/
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
group_add:
|
||||
- video
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
command: --model-id ${CHATQNA_RERANK_MODEL_ID} --auto-truncate
|
||||
chatqna-tgi-service:
|
||||
image: ${CHATQNA_TGI_SERVICE_IMAGE}
|
||||
container_name: chatqna-tgi-server
|
||||
ports:
|
||||
- "${CHATQNA_TGI_SERVICE_PORT}:80"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HUGGING_FACE_HUB_TOKEN: ${CHATQNA_HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
|
||||
shm_size: 1g
|
||||
devices:
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri/:/dev/dri/
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
group_add:
|
||||
- video
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
command: --model-id ${CHATQNA_LLM_MODEL_ID}
|
||||
ipc: host
|
||||
chatqna-llm-faqgen:
|
||||
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
||||
container_name: llm-faqgen-server
|
||||
depends_on:
|
||||
- chatqna-tgi-service
|
||||
ports:
|
||||
- ${CHATQNA_LLM_FAQGEN_PORT:-9000}:9000
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
HF_TOKEN: ${HF_TOKEN}
|
||||
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenTgi}
|
||||
LOGFLAG: ${LOGFLAG:-False}
|
||||
restart: unless-stopped
|
||||
chatqna-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-backend-server
|
||||
depends_on:
|
||||
- chatqna-redis-vector-db
|
||||
- chatqna-tei-embedding-service
|
||||
- chatqna-retriever
|
||||
- chatqna-tei-reranking-service
|
||||
- chatqna-tgi-service
|
||||
- chatqna-llm-faqgen
|
||||
ports:
|
||||
- "${CHATQNA_BACKEND_SERVICE_PORT}:8888"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=${CHATQNA_MEGA_SERVICE_HOST_IP}
|
||||
- EMBEDDING_SERVER_HOST_IP=${HOST_IP}
|
||||
- EMBEDDING_SERVER_PORT=${CHATQNA_TEI_EMBEDDING_PORT:-80}
|
||||
- RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
|
||||
- RERANK_SERVER_HOST_IP=${HOST_IP}
|
||||
- RERANK_SERVER_PORT=${CHATQNA_TEI_RERANKING_PORT:-80}
|
||||
- LLM_SERVER_HOST_IP=${HOST_IP}
|
||||
- LLM_SERVER_PORT=${CHATQNA_LLM_FAQGEN_PORT:-9000}
|
||||
- LLM_MODEL=${CHATQNA_LLM_MODEL_ID}
|
||||
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_FAQGEN}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-ui-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
|
||||
container_name: chatqna-ui-server
|
||||
depends_on:
|
||||
- chatqna-backend-server
|
||||
ports:
|
||||
- "${CHATQNA_FRONTEND_SERVICE_PORT}:5173"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- CHAT_BASE_URL=${CHATQNA_BACKEND_SERVICE_ENDPOINT}
|
||||
- UPLOAD_FILE_BASE_URL=${CHATQNA_DATAPREP_SERVICE_ENDPOINT}
|
||||
- GET_FILE=${CHATQNA_DATAPREP_GET_FILE_ENDPOINT}
|
||||
- DELETE_FILE=${CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: chaqna-nginx-server
|
||||
depends_on:
|
||||
- chatqna-backend-server
|
||||
- chatqna-ui-server
|
||||
ports:
|
||||
- "${CHATQNA_NGINX_PORT}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=${CHATQNA_FRONTEND_SERVICE_IP}
|
||||
- FRONTEND_SERVICE_PORT=${CHATQNA_FRONTEND_SERVICE_PORT}
|
||||
- BACKEND_SERVICE_NAME=${CHATQNA_BACKEND_SERVICE_NAME}
|
||||
- BACKEND_SERVICE_IP=${CHATQNA_BACKEND_SERVICE_IP}
|
||||
- BACKEND_SERVICE_PORT=${CHATQNA_BACKEND_SERVICE_PORT}
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
@@ -15,6 +15,7 @@ export CHATQNA_REDIS_VECTOR_PORT=16379
|
||||
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8001
|
||||
export CHATQNA_REDIS_DATAPREP_PORT=6007
|
||||
export CHATQNA_REDIS_RETRIEVER_PORT=7000
|
||||
export CHATQNA_LLM_FAQGEN_PORT=18010
|
||||
export CHATQNA_INDEX_NAME="rag-redis"
|
||||
export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Build Mega Service of ChatQnA on Xeon
|
||||
|
||||
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
|
||||
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, `llm`, and `faqgen`.
|
||||
|
||||
The default pipeline deploys with vLLM as the LLM serving component and leverages the rerank component. It also provides options for not using rerank in the pipeline and for using a TGI backend for the LLM microservice; please refer to the [start-all-the-services-docker-containers](#start-all-the-services-docker-containers) section in this page. In addition, refer to [Build with Pinecone VectorDB](./README_pinecone.md) and [Build with Qdrant VectorDB](./README_qdrant.md) for other deployment variants.
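For orientation, the pipeline variant is selected purely by which compose file you bring up; a minimal sketch follows (the authoritative, fully parameterized commands, including the FaqGen and telemetry variants, appear in the start-the-services sections later in this document):

```bash
# Minimal sketch of the deployment choices described above
# (see the start-the-services sections below for the authoritative commands)
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
source ./set_env.sh

docker compose -f compose.yaml up -d                   # default: vLLM serving + rerank
# docker compose -f compose_tgi.yaml up -d             # use TGI as the LLM backend
# docker compose -f compose_without_rerank.yaml up -d  # drop the rerank stage
```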
|
||||
|
||||
@@ -30,7 +30,7 @@ To set up environment variables for deploying ChatQnA services, follow these ste
|
||||
export http_proxy="Your_HTTP_Proxy"
|
||||
export https_proxy="Your_HTTPs_Proxy"
|
||||
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
|
||||
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
|
||||
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,llm-faqgen
|
||||
```
|
||||
|
||||
3. Set up other environment variables:
|
||||
@@ -59,8 +59,10 @@ docker compose up -d
|
||||
To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged along with the default compose.yaml file.
|
||||
CPU example with Open Telemetry feature:
|
||||
|
||||
> NOTE: To get the supported Grafana dashboards, please run `download_opea_dashboard.sh` following the commands below.
|
||||
|
||||
```bash
|
||||
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
|
||||
./grafana/dashboards/download_opea_dashboard.sh
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
|
||||
```
|
||||
|
||||
@@ -139,29 +141,27 @@ docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_p
|
||||
cd ..
|
||||
```
|
||||
|
||||
### 3. Build MegaService Docker Image
|
||||
### 3. Build FaqGen LLM Image (Optional)
|
||||
|
||||
1. MegaService with Rerank
|
||||
If you want to enable the FAQ generation LLM in the pipeline, use the command below:
|
||||
|
||||
To construct the Mega Service with Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build MegaService Docker image via below command:
|
||||
```bash
|
||||
git clone https://github.com/opea-project/GenAIComps.git
|
||||
cd GenAIComps
|
||||
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
|
||||
```
|
||||
|
||||
```bash
|
||||
git clone https://github.com/opea-project/GenAIExamples.git
|
||||
cd GenAIExamples/ChatQnA
|
||||
docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
|
||||
```
|
||||
### 4. Build MegaService Docker Image
|
||||
|
||||
2. MegaService without Rerank
|
||||
To construct the Mega Service with Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image with the command below:
|
||||
|
||||
To construct the Mega Service without Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna_without_rerank.py` Python script. Build MegaService Docker image via below command:
|
||||
```bash
|
||||
git clone https://github.com/opea-project/GenAIExamples.git
|
||||
cd GenAIExamples/ChatQnA
|
||||
docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
|
||||
```
|
||||
|
||||
```bash
|
||||
git clone https://github.com/opea-project/GenAIExamples.git
|
||||
cd GenAIExamples/ChatQnA
|
||||
docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank .
|
||||
```
|
||||
|
||||
### 4. Build UI Docker Image
|
||||
### 5. Build UI Docker Image
|
||||
|
||||
Build the frontend Docker image with the command below:
|
||||
|
||||
@@ -170,7 +170,7 @@ cd GenAIExamples/ChatQnA/ui
|
||||
docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
|
||||
```
|
||||
|
||||
### 5. Build Conversational React UI Docker Image (Optional)
|
||||
### 6. Build Conversational React UI Docker Image (Optional)
|
||||
|
||||
Build the frontend Docker image that enables a conversational experience with the ChatQnA megaservice, using the command below:
|
||||
|
||||
@@ -181,7 +181,7 @@ cd GenAIExamples/ChatQnA/ui
|
||||
docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
|
||||
```
|
||||
|
||||
### 6. Build Nginx Docker Image
|
||||
### 7. Build Nginx Docker Image
|
||||
|
||||
```bash
|
||||
cd GenAIComps
|
||||
@@ -192,10 +192,14 @@ Then run the command `docker images`, you will have the following 5 Docker Image
|
||||
|
||||
1. `opea/dataprep:latest`
|
||||
2. `opea/retriever:latest`
|
||||
3. `opea/chatqna:latest` or `opea/chatqna-without-rerank:latest`
|
||||
3. `opea/chatqna:latest`
|
||||
4. `opea/chatqna-ui:latest`
|
||||
5. `opea/nginx:latest`
|
||||
|
||||
If the FaqGen-related Docker image is built, you will find one more image:
|
||||
|
||||
- `opea/llm-faqgen:latest`
|
||||
|
||||
## 🚀 Start Microservices
|
||||
|
||||
### Required Models
|
||||
@@ -219,7 +223,7 @@ For users in China who are unable to download models directly from Huggingface,
|
||||
export HF_ENDPOINT="https://hf-mirror.com"
|
||||
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
# Start vLLM LLM Service
|
||||
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
|
||||
docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
|
||||
# Start TGI LLM Service
|
||||
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
|
||||
```
|
||||
@@ -236,7 +240,7 @@ For users in China who are unable to download models directly from Huggingface,
|
||||
export HF_TOKEN=${your_hf_token}
|
||||
export model_path="/path/to/model"
|
||||
# Start vLLM LLM Service
|
||||
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
|
||||
docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
|
||||
# Start TGI LLM Service
|
||||
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
|
||||
```
|
||||
@@ -285,6 +289,8 @@ docker compose -f compose.yaml up -d
|
||||
docker compose -f compose_without_rerank.yaml up -d
|
||||
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
|
||||
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
|
||||
# Start ChatQnA with FaqGen Pipeline
|
||||
docker compose -f compose_faqgen.yaml up -d
|
||||
```
|
||||
|
||||
If using TGI as the LLM serving backend:
|
||||
@@ -293,6 +299,8 @@ If use TGI as the LLM serving backend.
|
||||
docker compose -f compose_tgi.yaml up -d
|
||||
# Start ChatQnA with Open Telemetry Tracing
|
||||
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
|
||||
# Start ChatQnA with FaqGen Pipeline
|
||||
docker compose -f compose_faqgen_tgi.yaml up -d
|
||||
```
|
||||
|
||||
### Validate Microservices
|
||||
@@ -367,7 +375,16 @@ For details on how to verify the correctness of the response, refer to [how-to-v
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
5. MegaService
|
||||
5. FaqGen LLM Microservice (if enabled)
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:${LLM_SERVICE_PORT}/v1/faqgen \
|
||||
-X POST \
|
||||
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
|
||||
-H 'Content-Type: application/json'
|
||||
```
|
||||
|
||||
6. MegaService
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
|
||||
@@ -375,7 +392,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
|
||||
}'
|
||||
```
|
||||
|
||||
6. Nginx Service
|
||||
7. Nginx Service
|
||||
|
||||
```bash
|
||||
curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
|
||||
@@ -383,7 +400,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v
|
||||
-d '{"messages": "What is the revenue of Nike in 2023?"}'
|
||||
```
|
||||
|
||||
7. Dataprep Microservice(Optional)
|
||||
8. Dataprep Microservice (Optional)
|
||||
|
||||
If you want to update the default knowledge base, you can use the following commands:
|
||||
|
||||
|
||||
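A minimal sketch, assuming the dataprep container is published on port 6007 as in the compose files above and exposes `/v1/dataprep/ingest` and `/v1/dataprep/get` routes (both route names and the sample file name are assumptions; check the dataprep component README for the exact paths):

```bash
# Hypothetical sketch: ingest a document, then list what has been indexed
# (routes and file name are assumptions, not taken from this README)
curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
    -H "Content-Type: multipart/form-data" \
    -F "files=@./nke-10k-2023.pdf"

curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
    -H "Content-Type: application/json"
```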
@@ -201,7 +201,7 @@ For users in China who are unable to download models directly from Huggingface,
|
||||
export HF_TOKEN=${your_hf_token}
|
||||
export HF_ENDPOINT="https://hf-mirror.com"
|
||||
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
|
||||
docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
|
||||
```
|
||||
|
||||
2. Offline
|
||||
@@ -215,7 +215,7 @@ For users in China who are unable to download models directly from Huggingface,
|
||||
```bash
|
||||
export HF_TOKEN=${your_hf_token}
|
||||
export model_path="/path/to/model"
|
||||
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
|
||||
docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80
|
||||
```
|
||||
|
||||
### Setup Environment Variables
|
||||
|
||||
@@ -4,10 +4,19 @@
|
||||
services:
|
||||
tei-embedding-service:
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
environment:
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
tei-reranking-service:
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
environment:
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
# vllm-service:
|
||||
# command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --otlp-traces-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
chatqna-xeon-backend-server:
|
||||
environment:
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
image: jaegertracing/all-in-one:1.67.0
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
@@ -21,7 +30,51 @@ services:
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
chatqna-xeon-backend-server:
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.52.0
|
||||
container_name: prometheus
|
||||
user: root
|
||||
volumes:
|
||||
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
|
||||
- ./prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yaml'
|
||||
ports:
|
||||
- '9090:9090'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
grafana:
|
||||
image: grafana/grafana:11.0.0
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- ./grafana_data:/var/lib/grafana
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
user: root
|
||||
environment:
|
||||
- ENABLE_OPEA_TELEMETRY=true
|
||||
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
||||
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
|
||||
GF_LOG_FILTERS: rendering:debug
|
||||
depends_on:
|
||||
- prometheus
|
||||
ports:
|
||||
- '3000:3000'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
node-exporter:
|
||||
image: prom/node-exporter
|
||||
container_name: node-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- --collector.filesystem.ignored-mount-points
|
||||
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
|
||||
ports:
|
||||
- 9100:9100
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
|
||||
@@ -8,12 +8,19 @@ services:
|
||||
ports:
|
||||
- "6379:6379"
|
||||
- "8001:8001"
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 5s
|
||||
timeout: 3s
|
||||
retries: 10
|
||||
dataprep-redis-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
redis-vector-db:
|
||||
condition: service_healthy
|
||||
tei-embedding-service:
|
||||
condition: service_started
|
||||
ports:
|
||||
- "6007:5000"
|
||||
environment:
|
||||
@@ -80,7 +87,7 @@ services:
|
||||
ports:
|
||||
- "9009:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
|
||||
shm_size: 128g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
|
||||
187
ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen.yaml
Normal file
@@ -0,0 +1,187 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
redis-vector-db:
|
||||
image: redis/redis-stack:7.2.0-v9
|
||||
container_name: redis-vector-db
|
||||
ports:
|
||||
- "${CHATQNA_REDIS_VECTOR_PORT:-6379}:6379"
|
||||
- "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT:-8001}:8001"
|
||||
dataprep-redis-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
ports:
|
||||
- "6007:5000"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: redis://redis-vector-db:6379
|
||||
REDIS_HOST: redis-vector-db
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
tei-embedding-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-embedding-server
|
||||
ports:
|
||||
- "6006:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
|
||||
retriever:
|
||||
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
|
||||
container_name: retriever-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
ports:
|
||||
- "7000:7000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: redis://redis-vector-db:6379
|
||||
REDIS_HOST: redis-vector-db
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
|
||||
restart: unless-stopped
|
||||
tei-reranking-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-reranking-server
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
|
||||
vllm-service:
|
||||
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
|
||||
container_name: vllm-server
|
||||
ports:
|
||||
- ${LLM_ENDPOINT_PORT:-9009}:80
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
|
||||
shm_size: 128g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_TOKEN: ${HF_TOKEN}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
VLLM_TORCH_PROFILER_DIR: "${VLLM_TORCH_PROFILER_DIR:-/mnt}"
|
||||
host_ip: ${host_ip}
|
||||
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
|
||||
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
|
||||
llm-faqgen:
|
||||
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
||||
container_name: llm-faqgen-server
|
||||
depends_on:
|
||||
vllm-service:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- ${LLM_SERVER_PORT:-9000}:9000
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
HF_TOKEN: ${HF_TOKEN}
|
||||
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenvLLM}
|
||||
LOGFLAG: ${LOGFLAG:-False}
|
||||
restart: unless-stopped
|
||||
chatqna-xeon-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-xeon-backend-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
- retriever
|
||||
- tei-reranking-service
|
||||
- vllm-service
|
||||
- llm-faqgen
|
||||
ports:
|
||||
- ${CHATQNA_BACKEND_PORT:-8888}:8888
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server
|
||||
- EMBEDDING_SERVER_HOST_IP=tei-embedding-service
|
||||
- EMBEDDING_SERVER_PORT=80
|
||||
- RETRIEVER_SERVICE_HOST_IP=retriever
|
||||
- RETRIEVER_SERVICE_PORT=7000
|
||||
- RERANK_SERVER_HOST_IP=tei-reranking-service
|
||||
- RERANK_SERVER_PORT=80
|
||||
- LLM_SERVER_HOST_IP=llm-faqgen
|
||||
- LLM_SERVER_PORT=9000
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_FAQGEN}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-xeon-ui-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
|
||||
container_name: chatqna-xeon-ui-server
|
||||
depends_on:
|
||||
- chatqna-xeon-backend-server
|
||||
ports:
|
||||
- ${CHATQNA_FRONTEND_SERVICE_PORT:-5173}:5173
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-xeon-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: chatqna-xeon-nginx-server
|
||||
depends_on:
|
||||
- chatqna-xeon-backend-server
|
||||
- chatqna-xeon-ui-server
|
||||
ports:
|
||||
- "${NGINX_PORT:-80}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=chatqna-xeon-ui-server
|
||||
- FRONTEND_SERVICE_PORT=5173
|
||||
- BACKEND_SERVICE_NAME=chatqna
|
||||
- BACKEND_SERVICE_IP=chatqna-xeon-backend-server
|
||||
- BACKEND_SERVICE_PORT=8888
|
||||
- DATAPREP_SERVICE_IP=dataprep-redis-service
|
||||
- DATAPREP_SERVICE_PORT=5000
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
187
ChatQnA/docker_compose/intel/cpu/xeon/compose_faqgen_tgi.yaml
Normal file
@@ -0,0 +1,187 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  redis-vector-db:
    image: redis/redis-stack:7.2.0-v9
    container_name: redis-vector-db
    ports:
      - "${CHATQNA_REDIS_VECTOR_PORT:-6379}:6379"
      - "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT:-8001}:8001"
  dataprep-redis-service:
    image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
    container_name: dataprep-redis-server
    depends_on:
      - redis-vector-db
      - tei-embedding-service
    ports:
      - "6007:5000"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      REDIS_URL: redis://redis-vector-db:6379
      REDIS_HOST: redis-vector-db
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-embedding-server
    ports:
      - "6006:80"
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
  retriever:
    image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
    container_name: retriever-redis-server
    depends_on:
      - redis-vector-db
    ports:
      - "7000:7000"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      REDIS_URL: redis://redis-vector-db:6379
      REDIS_HOST: redis-vector-db
      INDEX_NAME: ${INDEX_NAME}
      TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LOGFLAG: ${LOGFLAG}
      RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
    restart: unless-stopped
  tei-reranking-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    container_name: tei-reranking-server
    ports:
      - "8808:80"
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
    command: --model-id ${RERANK_MODEL_ID} --auto-truncate
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    container_name: tgi-server
    ports:
      - ${LLM_ENDPOINT_PORT:-9009}:80
    volumes:
      - "${MODEL_CACHE:-./data}:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
      HF_TOKEN: ${HF_TOKEN}
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
  llm-faqgen:
    image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
    container_name: llm-faqgen-server
    depends_on:
      tgi-service:
        condition: service_healthy
    ports:
      - ${LLM_SERVER_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      LLM_ENDPOINT: ${LLM_ENDPOINT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      HF_TOKEN: ${HF_TOKEN}
      FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenTgi}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped
  chatqna-xeon-backend-server:
    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
    container_name: chatqna-xeon-backend-server
    depends_on:
      - redis-vector-db
      - tei-embedding-service
      - retriever
      - tei-reranking-service
      - tgi-service
      - llm-faqgen
    ports:
      - ${CHATQNA_BACKEND_PORT:-8888}:8888
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server
      - EMBEDDING_SERVER_HOST_IP=tei-embedding-service
      - EMBEDDING_SERVER_PORT=80
      - RETRIEVER_SERVICE_HOST_IP=retriever
      - RETRIEVER_SERVICE_PORT=7000
      - RERANK_SERVER_HOST_IP=tei-reranking-service
      - RERANK_SERVER_PORT=80
      - LLM_SERVER_HOST_IP=llm-faqgen
      - LLM_SERVER_PORT=9000
      - LLM_MODEL=${LLM_MODEL_ID}
      - LOGFLAG=${LOGFLAG}
      - CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_FAQGEN}
    ipc: host
    restart: always
  chatqna-xeon-ui-server:
    image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
    container_name: chatqna-xeon-ui-server
    depends_on:
      - chatqna-xeon-backend-server
    ports:
      - ${CHATQNA_FRONTEND_SERVICE_PORT:-5173}:5173
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
    ipc: host
    restart: always
  chatqna-xeon-nginx-server:
    image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
    container_name: chatqna-xeon-nginx-server
    depends_on:
      - chatqna-xeon-backend-server
      - chatqna-xeon-ui-server
    ports:
      - "${NGINX_PORT:-80}:80"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - FRONTEND_SERVICE_IP=chatqna-xeon-ui-server
      - FRONTEND_SERVICE_PORT=5173
      - BACKEND_SERVICE_NAME=chatqna
      - BACKEND_SERVICE_IP=chatqna-xeon-backend-server
      - BACKEND_SERVICE_PORT=8888
      - DATAPREP_SERVICE_IP=dataprep-redis-service
      - DATAPREP_SERVICE_PORT=5000
    ipc: host
    restart: always

networks:
  default:
    driver: bridge
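Once the environment variables referenced above (such as `EMBEDDING_MODEL_ID`, `LLM_MODEL_ID`, and `HF_TOKEN`) are set, a minimal sketch of bringing this FAQGen-on-TGI pipeline up from the compose file's directory:

```bash
cd ChatQnA/docker_compose/intel/cpu/xeon
docker compose -f compose_faqgen_tgi.yaml up -d
```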
@@ -113,7 +113,7 @@ services:
     ports:
       - "6006:80"
     volumes:
-      - "./data:/data"
+      - "${MODEL_CACHE:-./data}:/data"
     shm_size: 1g
     environment:
       no_proxy: ${no_proxy}
@@ -127,7 +127,7 @@ services:
     ports:
       - "8808:80"
     volumes:
-      - "./data:/data"
+      - "${MODEL_CACHE:-./data}:/data"
     shm_size: 1g
     environment:
       no_proxy: ${no_proxy}
@@ -144,7 +144,7 @@ services:
     ports:
       - "9009:80"
     volumes:
-      - "./data:/data"
+      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
     shm_size: 128g
     environment:
       no_proxy: ${no_proxy}

@@ -80,7 +80,7 @@ services:
     ports:
       - "6042:80"
     volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
     shm_size: 128g
     environment:
       no_proxy: ${no_proxy}

@@ -4,12 +4,21 @@
services:
  tei-embedding-service:
    command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  tei-reranking-service:
    command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  tgi-service:
    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  chatqna-xeon-backend-server:
    environment:
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
  jaeger:
-   image: jaegertracing/all-in-one:latest
+   image: jaegertracing/all-in-one:1.67.0
    container_name: jaeger
    ports:
      - "16686:16686"
@@ -23,7 +32,51 @@ services:
      https_proxy: ${https_proxy}
      COLLECTOR_ZIPKIN_HOST_PORT: 9411
    restart: unless-stopped
-  chatqna-xeon-backend-server:
  prometheus:
    image: prom/prometheus:v2.52.0
    container_name: prometheus
    user: root
    volumes:
      - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
      - ./prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yaml'
    ports:
      - '9090:9090'
    ipc: host
    restart: unless-stopped
  grafana:
    image: grafana/grafana:11.0.0
    container_name: grafana
    volumes:
      - ./grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/var/lib/grafana/dashboards
      - ./grafana/provisioning:/etc/grafana/provisioning
    user: root
    environment:
      - ENABLE_OPEA_TELEMETRY=true
      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
      GF_SECURITY_ADMIN_PASSWORD: admin
      GF_RENDERING_CALLBACK_URL: http://grafana:3000/
      GF_LOG_FILTERS: rendering:debug
    depends_on:
      - prometheus
    ports:
      - '3000:3000'
    ipc: host
    restart: unless-stopped
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - --collector.filesystem.ignored-mount-points
      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
    ports:
      - 9100:9100
    restart: always
    deploy:
      mode: global

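As with the other telemetry pieces in this change, the file above is an overlay meant to be layered onto the base deployment; for example:

```bash
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```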
@@ -64,7 +64,7 @@ services:
     ports:
       - "9009:80"
     volumes:
-      - "${MODEL_CACHE:-./data}:/data"
+      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
     shm_size: 128g
     environment:
       no_proxy: ${no_proxy}
@@ -75,7 +75,7 @@ services:
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
   chatqna-xeon-backend-server:
-    image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
+    image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-xeon-backend-server
     depends_on:
       - redis-vector-db
@@ -97,6 +97,7 @@ services:
       - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
       - LLM_MODEL=${LLM_MODEL_ID}
       - LOGFLAG=${LOGFLAG}
+      - CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_NO_RERANK}
     ipc: host
     restart: always
   chatqna-xeon-ui-server:

@@ -0,0 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
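A sketch of running this script so the downloaded dashboards land where Grafana's provisioning (see the files below) expects them; the script's name and location are assumptions based on this diff:

```bash
# Assumption: the script lives in grafana/dashboards, the directory the compose
# file mounts into the Grafana container as /var/lib/grafana/dashboards
cd ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards
bash download_opea_dashboard.sh
```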
@@ -0,0 +1,14 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10 # how often Grafana will scan for changed dashboards
    options:
      path: /var/lib/grafana/dashboards
@@ -0,0 +1,54 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# config file version
apiVersion: 1

# list of datasources that should be deleted from the database
deleteDatasources:
  - name: Prometheus
    orgId: 1

# list of datasources to insert/update depending
# what's available in the database
datasources:
  # <string, required> name of the datasource. Required
  - name: Prometheus
    # <string, required> datasource type. Required
    type: prometheus
    # <string, required> access mode. direct or proxy. Required
    access: proxy
    # <int> org id. will default to orgId 1 if not specified
    orgId: 1
    # <string> url
    url: http://prometheus:9090
    # <string> database password, if used
    password:
    # <string> database user, if used
    user:
    # <string> database name, if used
    database:
    # <bool> enable/disable basic auth
    basicAuth: false
    # <string> basic auth username, if used
    basicAuthUser:
    # <string> basic auth password, if used
    basicAuthPassword:
    # <bool> enable/disable with credentials headers
    withCredentials:
    # <bool> mark as default datasource. Max one per org
    isDefault: true
    # <map> fields that will be converted to json and stored in json_data
    jsonData:
      httpMethod: GET
      graphiteVersion: "1.1"
      tlsAuth: false
      tlsAuthWithCACert: false
    # <string> json object of data that will be encrypted.
    secureJsonData:
      tlsCACert: "..."
      tlsClientCert: "..."
      tlsClientKey: "..."
    version: 1
    # <bool> allow users to edit datasources from the UI.
    editable: true
43
ChatQnA/docker_compose/intel/cpu/xeon/prometheus.yaml
Normal file
@@ -0,0 +1,43 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL}
global:
  scrape_interval: 5s
  external_labels:
    monitor: "my-monitor"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["prometheus:9090"]
  - job_name: "vllm"
    metrics_path: /metrics
    static_configs:
      - targets: ["vllm-service:80"]
  - job_name: "tgi"
    metrics_path: /metrics
    static_configs:
      - targets: ["tgi-service:80"]
  - job_name: "tei-embedding"
    metrics_path: /metrics
    static_configs:
      - targets: ["tei-embedding-server:80"]
  - job_name: "tei-reranking"
    metrics_path: /metrics
    static_configs:
      - targets: ["tei-reranking-server:80"]
  - job_name: "retriever"
    metrics_path: /metrics
    static_configs:
      - targets: ["retriever-redis-server:7000"]
  - job_name: "dataprep-redis-service"
    metrics_path: /metrics
    static_configs:
      - targets: ["dataprep-redis-server:5000"]
  - job_name: "chatqna-backend-server"
    metrics_path: /metrics
    static_configs:
      - targets: ["chatqna-xeon-backend-server:8888"]
  - job_name: "prometheus-node-exporter"
    metrics_path: /metrics
    static_configs:
      - targets: ["node-exporter:9100"]
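Once the stack is up, one quick way to confirm these scrape jobs are healthy is Prometheus's standard targets API (using the `9090:9090` port mapping from the compose file above):

```bash
curl -s http://${host_ip}:9090/api/v1/targets | grep -o '"health":"[^"]*"'
```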
@@ -18,3 +18,15 @@ export LOGFLAG=""
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
# Set no proxy
|
||||
export no_proxy="$no_proxy,chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,node-exporter,$JAEGER_IP"
|
||||
|
||||
export LLM_ENDPOINT_PORT=8010
|
||||
export LLM_SERVER_PORT=9000
|
||||
export CHATQNA_BACKEND_PORT=8888
|
||||
export CHATQNA_REDIS_VECTOR_PORT=6379
|
||||
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8001
|
||||
export CHATQNA_FRONTEND_SERVICE_PORT=5173
|
||||
export NGINX_PORT=80
|
||||
export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
|
||||
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
|
||||
|
||||
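A quick sanity check that the derived telemetry endpoints are reachable after sourcing the script (Jaeger's UI listens on 16686 per the compose file above; this sketch is not part of set_env.sh itself):

```bash
echo "$JAEGER_IP"
# Expect an HTTP 200 from the Jaeger UI once the telemetry stack is running
curl -s -o /dev/null -w "%{http_code}\n" "http://$JAEGER_IP:16686"
```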
@@ -1,91 +1,106 @@
# Build MegaService of ChatQnA on Gaudi
# Example ChatQnA deployments on an Intel® Gaudi® Platform

This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
This example covers the single-node on-premises deployment of the ChatQnA example using OPEA components. There are various ways to enable ChatQnA, but this example will focus on four options available for deploying the ChatQnA pipeline to Intel® Gaudi® AI Accelerators. This example begins with a Quick Start section and then documents how to modify deployments, leverage new models, and configure the number of allocated devices.

The default pipeline deploys with vLLM as the LLM serving component and leverages the rerank component. It also provides options for not using rerank in the pipeline, leveraging guardrails, or using the TGI backend for the LLM microservice; please refer to the [start-all-the-services-docker-containers](#start-all-the-services-docker-containers) section on this page.
This example includes the following sections:

Quick Start:
- [ChatQnA Quick Start Deployment](#chatqna-quick-start-deployment): Demonstrates how to quickly deploy a ChatQnA application/pipeline on an Intel® Gaudi® platform.
- [ChatQnA Docker Compose Files](#chatqna-docker-compose-files): Describes some example deployments and their docker compose files.
- [ChatQnA Service Configuration](#chatqna-service-configuration): Describes the services and possible configuration changes.

1. Set up the environment variables.
2. Run Docker Compose.
3. Consume the ChatQnA Service.
**Note** This example requires access to a properly installed Intel® Gaudi® platform with a functional Docker service configured to use the habanalabs-container-runtime. Please consult the [Intel® Gaudi® software Installation Guide](https://docs.habana.ai/en/v1.20.0/Installation_Guide/Driver_Installation.html) for more information.

Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, make sure you have either requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.
## ChatQnA Quick Start Deployment

## Quick Start: 1. Setup Environment Variables
This section describes how to quickly deploy and test the ChatQnA service manually on an Intel® Gaudi® platform. The basic steps are:

To set up environment variables for deploying ChatQnA services, follow these steps:
1. [Access the Code](#access-the-code)
2. [Generate a HuggingFace Access Token](#generate-a-huggingface-access-token)
3. [Configure the Deployment Environment](#configure-the-deployment-environment)
4. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose)
5. [Check the Deployment Status](#check-the-deployment-status)
6. [Test the Pipeline](#test-the-pipeline)
7. [Cleanup the Deployment](#cleanup-the-deployment)

1. Set the required environment variables:
### Access the Code

```bash
# Example: host_ip="192.168.1.1"
export host_ip="External_Public_IP"
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
Clone the GenAIExamples repository and access the ChatQnA Intel® Gaudi® platform Docker Compose files and supporting scripts:

2. If you are in a proxy environment, also set the proxy-related environment variables:
```
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
```

```bash
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
```
Check out a released version, such as v1.2:

3. Set up other environment variables:
```
git checkout v1.2
```

```bash
source ./set_env.sh
```
### Generate a HuggingFace Access Token

4. Change Model for LLM Serving
Some HuggingFace resources, such as some models, are only accessible if you have an access token. If you do not already have a HuggingFace access token, you can create one by first creating an account by following the steps provided at [HuggingFace](https://huggingface.co/) and then generating a [user access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token).

By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default model can be changed to another validated LLM model.
Please pick a [validated LLM model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, overwrite it by exporting LLM_MODEL_ID to the new model or by modifying set_env.sh, and then repeat step 3.
For example, change to DeepSeek-R1-Distill-Qwen-32B using the following command.
### Configure the Deployment Environment

```bash
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
```
To set up environment variables for deploying ChatQnA services, source the _set_env.sh_ script in this directory:

Please also check [required gaudi cards for different models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#system-requirements-for-llm-models) for new models.
It might be necessary to increase the number of Gaudi cards for the model by exporting NUM_CARDS with a new value or by modifying set_env.sh, and then repeating step 3. For example, increase the number of Gaudi cards for DeepSeek-R1-Distill-Qwen-32B using the following command:
```
source ./set_env.sh
```

```bash
export NUM_CARDS=4
```
The _set_env.sh_ script will prompt for required and optional environment variables used to configure the ChatQnA services. If a value is not entered, the script will use a default value. It will also generate a _.env_ file defining the desired configuration. Consult the section on [ChatQnA Service configuration](#chatqna-service-configuration) for information on how service-specific configuration parameters affect deployments.

## Quick Start: 2. Run Docker Compose
### Deploy the Services Using Docker Compose

To deploy the ChatQnA services, execute the `docker compose up` command with the appropriate arguments. For a default deployment, execute:

```bash
docker compose up -d
```

To enable OpenTelemetry tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
The ChatQnA docker images should automatically be downloaded from the `OPEA registry` and deployed on the Intel® Gaudi® Platform:

```bash
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
```
[+] Running 10/10
 ✔ Network gaudi_default                   Created   0.1s
 ✔ Container tei-reranking-gaudi-server    Started   0.7s
 ✔ Container vllm-gaudi-server             Started   0.7s
 ✔ Container tei-embedding-gaudi-server    Started   0.3s
 ✔ Container redis-vector-db               Started   0.6s
 ✔ Container retriever-redis-server        Started   1.1s
 ✔ Container dataprep-redis-server         Started   1.1s
 ✔ Container chatqna-gaudi-backend-server  Started   1.3s
 ✔ Container chatqna-gaudi-ui-server       Started   1.7s
 ✔ Container chatqna-gaudi-nginx-server    Started   1.9s
```

It will automatically download the docker images from `docker hub`:
### Check the Deployment Status

```bash
docker pull opea/chatqna:latest
docker pull opea/chatqna-ui:latest
```
After running docker compose, check if all the containers launched via docker compose have started:

```
docker ps -a
```

In the following cases, you can build the docker images from source yourself.
For the default deployment, the following 10 containers should have started:

- Failed to download the docker image.
```
CONTAINER ID   IMAGE                                                   COMMAND                  CREATED         STATUS                     PORTS                                                                                  NAMES
8365b0a6024d   opea/nginx:latest                                       "/docker-entrypoint.…"   2 minutes ago   Up 2 minutes               0.0.0.0:80->80/tcp, :::80->80/tcp                                                      chatqna-gaudi-nginx-server
f090fe262c74   opea/chatqna-ui:latest                                  "docker-entrypoint.s…"   2 minutes ago   Up 2 minutes               0.0.0.0:5173->5173/tcp, :::5173->5173/tcp                                              chatqna-gaudi-ui-server
ec97d7651c96   opea/chatqna:latest                                     "python chatqna.py"      2 minutes ago   Up 2 minutes               0.0.0.0:8888->8888/tcp, :::8888->8888/tcp                                              chatqna-gaudi-backend-server
a61fb7dc4fae   opea/dataprep:latest                                    "sh -c 'python $( [ …"   2 minutes ago   Up 2 minutes               0.0.0.0:6007->5000/tcp, [::]:6007->5000/tcp                                            dataprep-redis-server
d560c232b120   opea/retriever:latest                                   "python opea_retriev…"   2 minutes ago   Up 2 minutes               0.0.0.0:7000->7000/tcp, :::7000->7000/tcp                                              retriever-redis-server
a1d7ca2d3787   ghcr.io/huggingface/tei-gaudi:1.5.0                     "text-embeddings-rou…"   2 minutes ago   Up 2 minutes               0.0.0.0:8808->80/tcp, [::]:8808->80/tcp                                                tei-reranking-gaudi-server
9a9f3fd4fd4c   opea/vllm-gaudi:latest                                  "python3 -m vllm.ent…"   2 minutes ago   Exited (1) 2 minutes ago                                                                                          vllm-gaudi-server
1ab9bbdf5182   redis/redis-stack:7.2.0-v9                              "/entrypoint.sh"         2 minutes ago   Up 2 minutes               0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp   redis-vector-db
9ee0789d819e   ghcr.io/huggingface/text-embeddings-inference:cpu-1.5   "text-embeddings-rou…"   2 minutes ago   Up 2 minutes               0.0.0.0:8090->80/tcp, [::]:8090->80/tcp                                                tei-embedding-gaudi-server
```

- If you want to use a specific version of a Docker image.
### Test the Pipeline

Please refer to 'Build Docker Images' below.

## QuickStart: 3. Consume the ChatQnA Service
Once the ChatQnA services are running, test the pipeline using the following command:

```bash
curl http://${host_ip}:8888/v1/chatqna \
@@ -95,504 +110,190 @@ curl http://${host_ip}:8888/v1/chatqna \
}'
```

## 🚀 Build Docker Images
**Note** The value of _host_ip_ was set using the _set_env.sh_ script and can be found in the _.env_ file.

First of all, you need to build the Docker images locally. This step can be skipped once the Docker images are published to Docker Hub.
### Cleanup the Deployment

```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
```
To stop the containers associated with the deployment, execute the following command:

```
docker compose -f compose.yaml down
```

### 1. Build Retriever Image

```bash
docker build --no-cache -t opea/retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
```
```
[+] Running 10/10
 ✔ Container chatqna-gaudi-nginx-server    Removed  10.5s
 ✔ Container dataprep-redis-server         Removed  10.5s
 ✔ Container chatqna-gaudi-ui-server       Removed  10.3s
 ✔ Container chatqna-gaudi-backend-server  Removed  10.3s
 ✔ Container vllm-gaudi-server             Removed   0.0s
 ✔ Container retriever-redis-server        Removed  10.4s
 ✔ Container tei-reranking-gaudi-server    Removed   2.0s
 ✔ Container tei-embedding-gaudi-server    Removed   1.2s
 ✔ Container redis-vector-db               Removed   0.4s
 ✔ Network gaudi_default                   Removed   0.4s
```

### 2. Build Dataprep Image
All the ChatQnA containers will be stopped and then removed on completion of the "down" command.

## ChatQnA Docker Compose Files

In the context of deploying a ChatQnA pipeline on an Intel® Gaudi® platform, the allocation and utilization of Gaudi devices across different services are important considerations for optimizing performance and resource efficiency. Each of the four example deployments, defined by the example Docker compose yaml files, demonstrates a unique approach to leveraging Gaudi hardware, reflecting different priorities and operational strategies.

### compose.yaml - Default Deployment

The default deployment utilizes Gaudi devices primarily for the `vllm-service`, which handles large language model (LLM) tasks. This service is configured to maximize the use of Gaudi's capabilities, potentially allocating multiple devices to enhance parallel processing and throughput. The `tei-reranking-service` also uses Gaudi hardware (1 card), indicating a balanced approach where both LLM processing and reranking tasks benefit from Gaudi's performance enhancements.

| Service Name                 | Image Name                                            | Gaudi Use    |
| ---------------------------- | ----------------------------------------------------- | ------------ |
| redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No           |
| dataprep-redis-service       | opea/dataprep:latest                                  | No           |
| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No           |
| retriever                    | opea/retriever:latest                                 | No           |
| tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card       |
| vllm-service                 | opea/vllm-gaudi:latest                                | Configurable |
| chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No           |
| chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No           |
| chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No           |

### compose_tgi.yaml - TGI Deployment

The TGI (Text Generation Inference) deployment and the default deployment differ primarily in their service configurations and specific focus on handling large language models (LLMs). The TGI deployment includes a unique `tgi-service`, which utilizes the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is specifically configured to run on Gaudi hardware. This service is designed to handle LLM tasks with optimizations such as `ENABLE_HPU_GRAPH` and `USE_FLASH_ATTENTION`. The `chatqna-gaudi-backend-server` in the TGI deployment depends on the `tgi-service`, whereas in the default deployment, it relies on the `vllm-service`.

| Service Name                 | Image Name                                            | Gaudi Specific |
| ---------------------------- | ----------------------------------------------------- | -------------- |
| redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No             |
| dataprep-redis-service       | opea/dataprep:latest                                  | No             |
| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No             |
| retriever                    | opea/retriever:latest                                 | No             |
| tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card         |
| **tgi-service**              | ghcr.io/huggingface/tgi-gaudi:2.0.6                   | Configurable   |
| chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No             |
| chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No             |
| chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No             |

This deployment may allocate more Gaudi resources to the tgi-service to optimize LLM tasks depending on the specific configuration and workload requirements.

### compose_faqgen.yaml - FAQ Generation Deployment

The FAQ (frequently asked questions and answers) generation deployment generates FAQs instead of performing normal text generation. It adds a new microservice called `llm-faqgen`, which interacts with the TGI/vLLM LLM server to generate FAQs from input text.

| Service Name                 | Image Name                                            | Gaudi Use    |
| ---------------------------- | ----------------------------------------------------- | ------------ |
| redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No           |
| dataprep-redis-service       | opea/dataprep:latest                                  | No           |
| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No           |
| retriever                    | opea/retriever:latest                                 | No           |
| tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card       |
| vllm-service                 | opea/vllm-gaudi:latest                                | Configurable |
| **llm-faqgen**               | **opea/llm-faqgen:latest**                            | No           |
| chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No           |
| chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No           |
| chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No           |

We also provide a TGI-based deployment for FAQ generation, `compose_faqgen_tgi.yaml`, which only replaces `vllm-service` with `tgi-service`; a quick check of the `llm-faqgen` microservice is sketched below.
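A minimal smoke test of the `llm-faqgen` microservice once either FAQGen pipeline is up, assuming the default `9000` port mapping and the `/v1/faqgen` route used by OPEA's FaqGen component:

```bash
curl http://${host_ip}:9000/v1/faqgen \
    -X POST \
    -d '{"messages": "Text Embeddings Inference (TEI) is a toolkit for serving open source text embedding models.", "max_tokens": 128}' \
    -H 'Content-Type: application/json'
```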
### compose_without_rerank.yaml - No ReRank Deployment

The _compose_without_rerank.yaml_ Docker Compose file is distinct from the default deployment primarily due to the exclusion of the reranking service. In this version, the `tei-reranking-service`, which is typically responsible for providing reranking capabilities for text embeddings and is configured to run on Gaudi hardware, is absent. This omission simplifies the service architecture by removing a layer of processing that would otherwise enhance the ranking of text embeddings. As a result, the backend server's dependencies are adjusted, without the need for the reranking service. This streamlined setup may impact the application's functionality and performance by focusing on core operations without the additional processing layer provided by reranking, potentially making it more efficient for scenarios where reranking is not essential, and freeing Intel® Gaudi® accelerators for other tasks.

| Service Name                 | Image Name                                            | Gaudi Specific |
| ---------------------------- | ----------------------------------------------------- | -------------- |
| redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No             |
| dataprep-redis-service       | opea/dataprep:latest                                  | No             |
| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No             |
| retriever                    | opea/retriever:latest                                 | No             |
| vllm-service                 | opea/vllm-gaudi:latest                                | Configurable   |
| chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No             |
| chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No             |
| chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No             |

This setup might allow for more Gaudi devices to be dedicated to the `vllm-service`, enhancing LLM processing capabilities and accommodating larger models. However, it also means that the benefits of reranking are sacrificed, which could impact the overall quality of the pipeline's output.

### compose_guardrails.yaml - Guardrails Deployment

The _compose_guardrails.yaml_ Docker Compose file introduces enhancements over the default deployment by incorporating additional services focused on safety and ChatQnA response control. Notably, it includes the `tgi-guardrails-service` and `guardrails` services. The `tgi-guardrails-service` uses the `ghcr.io/huggingface/tgi-gaudi:2.0.6` image and is configured to run on Gaudi hardware, providing functionality to manage input constraints and ensure safe operations within defined limits. The guardrails service, using the `opea/guardrails:latest` image, acts as a safety layer that interfaces with the `tgi-guardrails-service` to enforce safety protocols and manage interactions with the large language model (LLM). The backend server now depends on the `tgi-guardrails-service` and `guardrails`, alongside existing dependencies like `redis-vector-db`, `tei-embedding-service`, `retriever`, `tei-reranking-service`, and `vllm-service`. The environment configurations for the backend are also updated to include settings for the guardrail services.

| Service Name                 | Image Name                                            | Gaudi Specific | Uses LLM |
| ---------------------------- | ----------------------------------------------------- | -------------- | -------- |
| redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No             | No       |
| dataprep-redis-service       | opea/dataprep:latest                                  | No             | No       |
| _tgi-guardrails-service_     | ghcr.io/huggingface/tgi-gaudi:2.0.6                   | 1 card         | Yes      |
| _guardrails_                 | opea/guardrails:latest                                | No             | No       |
| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No             | No       |
| retriever                    | opea/retriever:latest                                 | No             | No       |
| tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | 1 card         | No       |
| vllm-service                 | opea/vllm-gaudi:latest                                | Configurable   | Yes      |
| chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No             | No       |
| chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No             | No       |
| chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No             | No       |

The deployment with guardrails introduces additional Gaudi-specific services, such as the `tgi-guardrails-service`, which necessitates careful consideration of Gaudi allocation. This deployment aims to balance safety and performance, potentially requiring a strategic distribution of Gaudi devices between the guardrail services and the LLM tasks to maintain both operational safety and efficiency.

### Telemetry Enablement - compose.telemetry.yaml and compose_tgi.telemetry.yaml

The telemetry Docker Compose files are incremental configurations designed to enhance existing deployments by integrating telemetry metrics, thereby providing valuable insights into the performance and behavior of certain services. This setup modifies specific services, such as the `tgi-service`, `tei-embedding-service` and `tei-reranking-service`, by adding a command-line argument that specifies an OpenTelemetry Protocol (OTLP) endpoint. This enables these services to export telemetry data to a designated endpoint, facilitating detailed monitoring and analysis. The `chatqna-gaudi-backend-server` is configured with environment variables that enable telemetry and specify the telemetry endpoint, ensuring that the backend server's operations are also monitored.

Additionally, the telemetry files introduce a new service, `jaeger`, which uses the `jaegertracing/all-in-one:latest` image. Jaeger is a powerful open-source tool for tracing and monitoring distributed systems, offering a user-friendly interface for visualizing traces and understanding the flow of requests through the system.

To enable OpenTelemetry tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file at deployment:
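```bash
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```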
```bash
docker build --no-cache -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
```

### 3. Build Guardrails Docker Image (Optional)

To fortify AI initiatives in production, the Guardrails microservice can secure model inputs and outputs, helping build trustworthy, safe, and secure LLM-based applications.

```bash
docker build -t opea/guardrails:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/src/guardrails/Dockerfile .
```

### 4. Build MegaService Docker Image

1. MegaService with Rerank

To construct the Mega Service with Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image using the command below:

```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA
docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
```

2. MegaService with Guardrails

If you want to enable the guardrails microservice in the pipeline, use the command below instead:

```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA/
docker build --no-cache -t opea/chatqna-guardrails:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.guardrails .
```

3. MegaService without Rerank

To construct the Mega Service without Rerank, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna_without_rerank.py` Python script. Build the MegaService Docker image via the command below:

```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA
docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank .
```

### 5. Build UI Docker Image

Construct the frontend Docker image using the command below:

```bash
cd GenAIExamples/ChatQnA/ui
docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
```

### 6. Build Conversational React UI Docker Image (Optional)

Build the frontend Docker image that enables a conversational experience with the ChatQnA megaservice via the command below:

**Export the value of the public IP address of your Gaudi node to the `host_ip` environment variable**

```bash
cd GenAIExamples/ChatQnA/ui
docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
```

### 7. Build Nginx Docker Image

```bash
cd GenAIComps
docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/nginx/src/Dockerfile .
```

Then run the command `docker images`; you will have the following 5 Docker images:

- `opea/retriever:latest`
- `opea/dataprep:latest`
- `opea/chatqna:latest`
- `opea/chatqna-ui:latest`
- `opea/nginx:latest`

If the Conversational React UI is built, you will find one more image:

- `opea/chatqna-conversation-ui:latest`

If the Guardrails docker image is built, you will find one more image:

- `opea/guardrails:latest`

## 🚀 Start MicroServices and MegaService

### Required Models

By default, the embedding, reranking and LLM models are set to the default values listed below:

| Service   | Model                               |
| --------- | ----------------------------------- |
| Embedding | BAAI/bge-base-en-v1.5               |
| Reranking | BAAI/bge-reranker-base              |
| LLM       | meta-llama/Meta-Llama-3-8B-Instruct |

Change the `xxx_MODEL_ID` values below as needed.

Users in China who are unable to download models directly from Huggingface can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. vLLM/TGI can load the models either online or offline as described below:

1. Online

```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="meta-llama/Meta-Llama-3-8B-Instruct"
# Start vLLM LLM Service
docker run -p 8007:80 -v ./data:/data --name vllm-gaudi-server -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e VLLM_TORCH_PROFILER_DIR="/mnt" --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
# Start TGI LLM Service
docker run -p 8005:80 -v ./data:/data --name tgi-gaudi-server -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id $model_name --max-input-tokens 1024 --max-total-tokens 2048
```

2. Offline

- Search your model name in ModelScope. For example, check [this page](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/files) for model `Meta-Llama-3-8B-Instruct`.

- Click the `Download this model` button and choose a way to download the model to your local path `/path/to/model`.

- Run the following command to start the LLM service.

```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8007:80 -v $model_path:/data --name vllm-gaudi-server --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e VLLM_TORCH_PROFILER_DIR="/mnt" --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model /data --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
# Start TGI LLM Service
docker run -p 8005:80 -v $model_path:/data --name tgi-gaudi-server --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id /data --max-input-tokens 1024 --max-total-tokens 2048
```

### Setup Environment Variables

1. Set the required environment variables:

```bash
# Example: host_ip="192.168.1.1"
export host_ip="External_Public_IP"
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
# Example: NGINX_PORT=80
export NGINX_PORT=${your_nginx_port}
```

2. If you are in a proxy environment, also set the proxy-related environment variables:

```bash
export http_proxy="Your_HTTP_Proxy"
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
```

3. Set up other environment variables:

```bash
source ./set_env.sh
```

### Start all the services Docker Containers

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
```

If using vLLM as the LLM serving backend:

```bash
# Start ChatQnA with Rerank Pipeline
docker compose -f compose.yaml up -d
# Start ChatQnA without Rerank Pipeline
docker compose -f compose_without_rerank.yaml up -d
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

If using TGI as the LLM serving backend.
For a TGI Deployment, this would become:

```bash
docker compose -f compose_tgi.yaml up -d
# Start ChatQnA with Open Telemetry Tracing
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
```

If you want to enable the guardrails microservice in the pipeline, use the commands below instead:
## ChatQnA Service Configuration

```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose -f compose_guardrails.yaml up -d
```
The table below provides a comprehensive overview of the ChatQnA services utilized across the various deployments as illustrated in the example Docker Compose files. Each row in the table represents a distinct service, detailing the possible images used to enable it and a concise description of its function within the deployment architecture. These services collectively enable functionalities such as data storage and management, text embedding, retrieval, reranking, and large language model processing. Additionally, specialized services like `tgi-service` and `guardrails` are included to enhance text generation inference and ensure operational safety, respectively. The table also highlights the integration of telemetry through the `jaeger` service, which provides tracing and monitoring capabilities.

> **_NOTE:_** Users need at least two Gaudi cards to run ChatQnA successfully.

| Service Name                 | Possible Image Names                                  | Optional | Description                                                                                        |
| ---------------------------- | ----------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------- |
| redis-vector-db              | redis/redis-stack:7.2.0-v9                            | No       | Acts as a Redis database for storing and managing data.                                            |
| dataprep-redis-service       | opea/dataprep:latest                                  | No       | Prepares data and interacts with the Redis database.                                               |
| tei-embedding-service        | ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 | No       | Provides text embedding services, often using Hugging Face models.                                 |
| retriever                    | opea/retriever:latest                                 | No       | Retrieves data from the Redis database and interacts with embedding services.                      |
| tei-reranking-service        | ghcr.io/huggingface/tei-gaudi:1.5.0                   | Yes      | Reranks text embeddings, typically using Gaudi hardware for enhanced performance.                  |
| vllm-service                 | opea/vllm-gaudi:latest                                | No       | Handles large language model (LLM) tasks, utilizing Gaudi hardware.                                |
| tgi-service                  | ghcr.io/huggingface/tgi-gaudi:2.0.6                   | Yes      | Specific to the TGI deployment, focuses on text generation inference using Gaudi hardware.         |
| tgi-guardrails-service       | ghcr.io/huggingface/tgi-gaudi:2.0.6                   | Yes      | Provides guardrails functionality, ensuring safe operations within defined limits.                 |
| guardrails                   | opea/guardrails:latest                                | Yes      | Acts as a safety layer, interfacing with the `tgi-guardrails-service` to enforce safety protocols. |
| chatqna-gaudi-backend-server | opea/chatqna:latest                                   | No       | Serves as the backend for the ChatQnA application, with variations depending on the deployment.    |
| chatqna-gaudi-ui-server      | opea/chatqna-ui:latest                                | No       | Provides the user interface for the ChatQnA application.                                           |
| chatqna-gaudi-nginx-server   | opea/nginx:latest                                     | No       | Acts as a reverse proxy, managing traffic between the UI and backend services.                     |
| jaeger                       | jaegertracing/all-in-one:latest                       | Yes      | Provides tracing and monitoring capabilities for distributed systems.                              |

### Validate MicroServices and MegaService
Many of these services provide pipeline support required for all ChatQnA deployments and are not specific to supporting the Intel® Gaudi® platform. Therefore, while the `redis-vector-db`, `dataprep-redis-service`, `retriever`, `chatqna-gaudi-backend-server`, `chatqna-gaudi-ui-server`, `chatqna-gaudi-nginx-server`, and `jaeger` services are configurable, they will not be covered by this example, which focuses on the configuration specifics of the services modified to support the Intel® Gaudi® platform.

Follow the instructions to validate MicroServices.
For validation details, please refer to [how-to-validate_service](./how_to_validate_service.md).
### vllm-service & tgi-service

1. TEI Embedding Service
In the configuration of the `vllm-service` and the `tgi-service`, two variables play a primary role in determining the service's performance and functionality: `LLM_MODEL_ID` and `NUM_CARDS`. Both can be set using the appropriate environment variables. The `LLM_MODEL_ID` parameter specifies the particular large language model (LLM) that the service will utilize, effectively determining the capabilities and characteristics of the language processing tasks it can perform. This model identifier ensures that the service is aligned with the specific requirements of the application, whether it involves text generation, comprehension, or other language-related tasks. The `NUM_CARDS` parameter dictates the number of Gaudi devices allocated to the service. A higher number of Gaudi devices can enhance parallel processing capabilities, reduce latency, and improve throughput.

```bash
curl ${host_ip}:8090/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```
However, developers need to be aware of the models that have been tested with the respective service image supporting the `vllm-service` and `tgi-service`. For example, documentation for the OPEA GenAIComps v1.0 release specifies the list of [validated LLM models](https://github.com/opea-project/GenAIComps/blob/v1.0/comps/llms/text-generation/README.md#validated-llm-models) for each Gaudi-enabled service image. Specific models may have stringent requirements on the number of Intel® Gaudi® devices required to support them.
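As a concrete sketch, pinning the model and card count (variable names as used by _set_env.sh_ in this directory) before redeploying:

```bash
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
source ./set_env.sh
docker compose up -d
```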
2. Retriever Microservice
#### Deepseek Model Support for Intel® Gaudi® Platform ChatQnA pipeline

To consume the retriever microservice, you need to generate a mock embedding vector via a Python script. The length of the embedding vector is determined by the embedding model.
Here we use the model `EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"`, whose vector size is 768.
ChatQnA now supports running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, set the `LLM_MODEL_ID` appropriately and the `NUM_CARDS` to 8. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` appropriately and set the `NUM_CARDS` to 4.
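For example, to switch to the 70B distill using the values from the note above:

```bash
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
export NUM_CARDS=8
```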
Check the vector dimension of your embedding model and set the `your_embedding` dimension to match.
### tei-embedding-service & tei-reranking-service

```bash
export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
curl http://${host_ip}:7000/v1/retrieval \
    -X POST \
    -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" \
    -H 'Content-Type: application/json'
```
The `ghcr.io/huggingface/text-embeddings-inference:cpu-1.5` image supporting `tei-embedding-service` and `tei-reranking-service` depends on the `EMBEDDING_MODEL_ID` or `RERANK_MODEL_ID` environment variable, respectively, to specify the embedding model and reranking model used for converting text into vector representations and rankings. This choice impacts the quality and relevance of the embeddings and rerankings for various applications. Unlike the `vllm-service`, the `tei-embedding-service` and `tei-reranking-service` each typically acquire only one Gaudi device and do not use the `NUM_CARDS` parameter; embedding and reranking tasks generally do not require extensive parallel processing, and one Gaudi per service is appropriate. The list of [supported embedding and reranking models](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) can be found at the [huggingface/tei-gaudi](https://github.com/huggingface/tei-gaudi?tab=readme-ov-file#supported-models) website.
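A minimal sketch of pinning both models to the defaults from the Required Models table above:

```bash
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
```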
3. TEI Reranking Service
### tgi-guardrails-service

> Skip for ChatQnA without Rerank pipeline
The `tgi-guardrails-service` uses the `GUARDRAILS_MODEL_ID` parameter to select a [supported model](https://github.com/huggingface/tgi-gaudi?tab=readme-ov-file#tested-models-and-configurations) for the associated `ghcr.io/huggingface/tgi-gaudi:2.0.6` image. Like the `tei-embedding-service` and `tei-reranking-service` services, it doesn't use the `NUM_CARDS` parameter.
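A sketch of selecting the guardrails model; `meta-llama/Meta-Llama-Guard-2-8B` is an assumption here, chosen as an example of a model from the linked tested-models list:

```bash
# Assumption: any model from the tgi-gaudi tested-models list may be substituted
export GUARDRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
```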
```bash
|
||||
curl http://${host_ip}:8808/rerank \
|
||||
-X POST \
|
||||
-d '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' \
|
||||
-H 'Content-Type: application/json'
|
||||
```

4. LLM backend Service

In the first startup, this service will take more time to download, load, and warm up the model. Once that is finished, the service will be ready.

Try the command below to check whether the LLM serving is ready.

```bash
# vLLM service
docker logs vllm-gaudi-server 2>&1 | grep complete
# If the service is ready, you will get a response like the one below.
INFO: Application startup complete.
```

```bash
# TGI service
docker logs tgi-gaudi-server | grep Connected
# If the service is ready, you will get a response like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```

Then try the `cURL` command below to validate the services. Note that the request body is double-quoted so that `${LLM_MODEL_ID}` expands to valid JSON.

```bash
# vLLM service
curl http://${host_ip}:8007/v1/chat/completions \
  -X POST \
  -d "{\"model\": \"${LLM_MODEL_ID}\", \"messages\": [{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}], \"max_tokens\": 17}" \
  -H 'Content-Type: application/json'
```

```bash
# TGI service
curl http://${host_ip}:8005/v1/chat/completions \
  -X POST \
  -d "{\"model\": \"${LLM_MODEL_ID}\", \"messages\": [{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}], \"max_tokens\": 17}" \
  -H 'Content-Type: application/json'
```

5. MegaService

```bash
curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
  "messages": "What is the revenue of Nike in 2023?"
}'
```

6. Nginx Service

```bash
curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
  -H "Content-Type: application/json" \
  -d '{"messages": "What is the revenue of Nike in 2023?"}'
```

7. Dataprep Microservice (Optional)

If you want to update the default knowledge base, you can use the following commands:

Update Knowledge Base via Local File Upload:

```bash
curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
  -H "Content-Type: multipart/form-data" \
  -F "files=@./nke-10k-2023.pdf"
```

This command updates a knowledge base by uploading a local file for processing. Update the file path according to your environment.
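
If the sample report is not already on disk, download a PDF to ingest first; the URL below is purely illustrative, so substitute the path to any document you actually have:

```bash
# Illustrative URL only -- point this at any PDF you want to ingest
wget -O nke-10k-2023.pdf https://example.com/path/to/nke-10k-2023.pdf
```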

Add Knowledge Base via HTTP Links:

```bash
curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
  -H "Content-Type: multipart/form-data" \
  -F 'link_list=["https://opea.dev"]'
```

This command updates a knowledge base by submitting a list of HTTP links for processing.
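
Since `link_list` is a JSON array, several links can also be submitted in a single request, for example:

```bash
curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
  -H "Content-Type: multipart/form-data" \
  -F 'link_list=["https://opea.dev","https://www.ces.tech/"]'
```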

Also, you can get the list of files/links you uploaded:

```bash
curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
  -H "Content-Type: application/json"
```

You will then get a response JSON like the one below. Notice that the returned `name`/`id` of an uploaded link is `https://xxx.txt`.

```json
[
  {
    "name": "nke-10k-2023.pdf",
    "id": "nke-10k-2023.pdf",
    "type": "File",
    "parent": ""
  },
  {
    "name": "https://opea.dev.txt",
    "id": "https://opea.dev.txt",
    "type": "File",
    "parent": ""
  }
]
```

To delete the file/link you uploaded:

```bash
# delete link
curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
  -d '{"file_path": "https://opea.dev.txt"}' \
  -H "Content-Type: application/json"

# delete file
curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
  -d '{"file_path": "nke-10k-2023.pdf"}' \
  -H "Content-Type: application/json"

# delete all uploaded files and links
curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
  -d '{"file_path": "all"}' \
  -H "Content-Type: application/json"
```

8. Guardrails (Optional)

```bash
curl http://${host_ip}:9090/v1/guardrails \
  -X POST \
  -d '{"text":"How do you buy a tiger in the US?","parameters":{"max_new_tokens":32}}' \
  -H 'Content-Type: application/json'
```

### Profile Microservices

To further analyze microservice performance, users can follow the instructions below to profile the microservices.

#### 1. vLLM backend Service

Users can follow the previous section to test the vLLM microservice or the ChatQnA MegaService.
By default, vLLM profiling is not enabled. Users can start and stop profiling with the following commands.

##### Start vLLM profiling

```bash
curl http://${host_ip}:9009/start_profile \
  -H "Content-Type: application/json" \
  -d "{\"model\": \"${LLM_MODEL_ID}\"}"
```

Users will see docker logs like the following from vllm-service if profiling started correctly.

```bash
INFO api_server.py:361] Starting profiler...
INFO api_server.py:363] Profiler started.
INFO: x.x.x.x:35940 - "POST /start_profile HTTP/1.1" 200 OK
```

After vLLM profiling is started, users can start asking questions and getting responses from the vLLM microservice or the ChatQnA MegaService.

##### Stop vLLM profiling

With the following command, users can stop vLLM profiling and generate a \*.pt.trace.json.gz file as the profiling result under the /mnt folder in the vllm-service docker instance.

```bash
# vLLM service
curl http://${host_ip}:9009/stop_profile \
  -H "Content-Type: application/json" \
  -d "{\"model\": \"${LLM_MODEL_ID}\"}"
```

Users will see docker logs like the following from vllm-service if profiling stopped correctly.

```bash
INFO api_server.py:368] Stopping profiler...
INFO api_server.py:370] Profiler stopped.
INFO: x.x.x.x:41614 - "POST /stop_profile HTTP/1.1" 200 OK
```

After vLLM profiling is stopped, users can use the command below to retrieve the \*.pt.trace.json.gz file from the /mnt folder (note the container name, per the compose file, is `vllm-gaudi-server`).

```bash
docker cp vllm-gaudi-server:/mnt/ .
```
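
You can then confirm the trace archive landed locally; the path assumes the `docker cp` destination used above:

```bash
ls -lh mnt/*.pt.trace.json.gz
```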

##### Check profiling result

Open a web browser and go to "chrome://tracing" or "ui.perfetto.dev", then load the json.gz file. You should see the vLLM profiling result as in the diagrams below.

![image](https://github.com/user-attachments/assets/48f7ef0e-5aff-4589-b0c7-b2c314da1ae0)

![image](https://github.com/user-attachments/assets/7e56e95a-0ff4-4950-a55c-f0f3e2e87746)

## 🚀 Launch the UI

### Launch with origin port

To access the frontend, open the following URL in your browser: http://{host_ip}:5173. By default, the UI runs on port 5173 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:

```yaml
  chatqna-gaudi-ui-server:
    image: opea/chatqna-ui:latest
    ...
    ports:
      - "80:5173"
```

### Launch with Nginx

If you want to launch the UI using Nginx, open this URL: `http://${host_ip}:${NGINX_PORT}` in your browser to access the frontend.

## 🚀 Launch the Conversational UI (Optional)

To access the Conversational UI (React-based) frontend, modify the UI service in the `compose.yaml` file. Replace the `chatqna-gaudi-ui-server` service with the `chatqna-gaudi-conversation-ui-server` service as per the config below:

```yaml
  chatqna-gaudi-conversation-ui-server:
    image: opea/chatqna-conversation-ui:latest
    container_name: chatqna-gaudi-conversation-ui-server
    environment:
      - APP_BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
      - APP_DATA_PREP_SERVICE_URL=${DATAPREP_SERVICE_ENDPOINT}
    ports:
      - "5174:80"
    depends_on:
      - chatqna-gaudi-backend-server
    ipc: host
    restart: always
```

Once the services are up, open the following URL in your browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose.yaml` file as shown below:

```yaml
  chatqna-gaudi-conversation-ui-server:
    image: opea/chatqna-conversation-ui:latest
    ...
    ports:
      - "80:80"
```

![project-screenshot](https://github.com/user-attachments/assets/87ecb65a-116e-4a86-a8dd-e00e01f980dd)

Here is an example of running ChatQnA:

![project-screenshot](https://github.com/user-attachments/assets/f8b5d35a-e822-4b4a-ae73-088886c25514)

Here is an example of running ChatQnA with Conversational UI (React):

![project-screenshot](https://github.com/user-attachments/assets/80c9a5f1-0b69-4c57-b77e-6a0c8f936464)

## Conclusion

In examining the various services and configurations across different deployments, developers should gain a comprehensive understanding of how each component contributes to the overall functionality and performance of a ChatQnA pipeline on an Intel® Gaudi® platform. Key services such as the `vllm-service`, `tei-embedding-service`, `tei-reranking-service`, and `tgi-guardrails-service` each consume Gaudi accelerators, leveraging specific models and hardware resources to optimize their respective tasks. The `LLM_MODEL_ID`, `EMBEDDING_MODEL_ID`, `RERANK_MODEL_ID`, and `GUARDRAILS_MODEL_ID` parameters specify the models used, directly impacting the quality and effectiveness of language processing, embedding, reranking, and safety operations.

The allocation of Gaudi devices, shaped by the Gaudi-dependent services and the `NUM_CARDS` parameter supporting the `vllm-service` or `tgi-service`, determines where computational power is applied to enhance performance.

Overall, the strategic configuration of these services, through careful selection of models and resource allocation, enables a balanced and efficient deployment. This approach ensures that the ChatQnA pipeline can meet diverse operational needs, from high-performance language model processing to robust safety protocols, all while optimizing the use of available hardware resources.

@@ -7,7 +7,7 @@ services:
|
||||
tei-reranking-service:
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
image: jaegertracing/all-in-one:1.67.0
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
@@ -21,6 +21,67 @@ services:
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.52.0
|
||||
container_name: prometheus
|
||||
user: root
|
||||
volumes:
|
||||
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
|
||||
- ./prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yaml'
|
||||
ports:
|
||||
- '9090:9090'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
grafana:
|
||||
image: grafana/grafana:11.0.0
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- ./grafana_data:/var/lib/grafana
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
user: root
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
||||
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
|
||||
GF_LOG_FILTERS: rendering:debug
|
||||
depends_on:
|
||||
- prometheus
|
||||
ports:
|
||||
- '3000:3000'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
node-exporter:
|
||||
image: prom/node-exporter
|
||||
container_name: node-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- --collector.filesystem.ignored-mount-points
|
||||
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
|
||||
ports:
|
||||
- 9100:9100
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
gaudi-exporter:
|
||||
image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
|
||||
container_name: gaudi-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /dev:/dev
|
||||
ports:
|
||||
- 41612:41611
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
chatqna-gaudi-backend-server:
|
||||
environment:
|
||||
- ENABLE_OPEA_TELEMETRY=true
|
||||
|
||||
@@ -8,12 +8,19 @@ services:
|
||||
ports:
|
||||
- "6379:6379"
|
||||
- "8001:8001"
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 5s
|
||||
timeout: 3s
|
||||
retries: 10
|
||||
dataprep-redis-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
redis-vector-db:
|
||||
condition: service_healthy
|
||||
tei-embedding-service:
|
||||
condition: service_started
|
||||
ports:
|
||||
- "6007:5000"
|
||||
environment:
|
||||
|
||||
198
ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen.yaml
Normal file
@@ -0,0 +1,198 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
redis-vector-db:
|
||||
image: redis/redis-stack:7.2.0-v9
|
||||
container_name: redis-vector-db
|
||||
ports:
|
||||
- "${CHATQNA_REDIS_VECTOR_PORT:-6379}:6379"
|
||||
- "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT:-8001}:8001"
|
||||
dataprep-redis-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
ports:
|
||||
- "6007:5000"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: redis://redis-vector-db:6379
|
||||
REDIS_HOST: redis-vector-db
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
tei-embedding-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-embedding-gaudi-server
|
||||
ports:
|
||||
- "8090:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
|
||||
retriever:
|
||||
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
|
||||
container_name: retriever-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
ports:
|
||||
- "7000:7000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: redis://redis-vector-db:6379
|
||||
REDIS_HOST: redis-vector-db
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
restart: unless-stopped
|
||||
tei-reranking-service:
|
||||
image: ghcr.io/huggingface/tei-gaudi:1.5.0
|
||||
container_name: tei-reranking-gaudi-server
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
MAX_WARMUP_SEQUENCE_LENGTH: 512
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
|
||||
vllm-service:
|
||||
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
|
||||
container_name: vllm-gaudi-server
|
||||
ports:
|
||||
- ${LLM_ENDPOINT_PORT:-8007}:80
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_TOKEN: ${HF_TOKEN}
|
||||
HF_HOME: "/data"
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
NUM_CARDS: ${NUM_CARDS:-1}
|
||||
VLLM_TORCH_PROFILER_DIR: "/mnt"
|
||||
host_ip: ${host_ip}
|
||||
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
|
||||
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
llm-faqgen:
|
||||
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
||||
container_name: llm-faqgen-server
|
||||
depends_on:
|
||||
vllm-service:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- ${LLM_SERVER_PORT:-9000}:9000
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
HF_TOKEN: ${HF_TOKEN}
|
||||
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenvLLM}
|
||||
LOGFLAG: ${LOGFLAG:-False}
|
||||
restart: unless-stopped
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-backend-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
- retriever
|
||||
- tei-reranking-service
|
||||
- vllm-service
|
||||
- llm-faqgen
|
||||
ports:
|
||||
- ${CHATQNA_BACKEND_PORT:-8888}:8888
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=chatqna-gaudi-backend-server
|
||||
- EMBEDDING_SERVER_HOST_IP=tei-embedding-service
|
||||
- EMBEDDING_SERVER_PORT=80
|
||||
- RETRIEVER_SERVICE_HOST_IP=retriever
|
||||
- RETRIEVER_SERVICE_PORT=7000
|
||||
- RERANK_SERVER_HOST_IP=tei-reranking-service
|
||||
- RERANK_SERVER_PORT=80
|
||||
- LLM_SERVER_HOST_IP=llm-faqgen
|
||||
- LLM_SERVER_PORT=9000
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_FAQGEN}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-ui-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-ui-server
|
||||
depends_on:
|
||||
- chatqna-gaudi-backend-server
|
||||
ports:
|
||||
- ${CHATQNA_FRONTEND_SERVICE_PORT:-5173}:5173
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-nginx-server
|
||||
depends_on:
|
||||
- chatqna-gaudi-backend-server
|
||||
- chatqna-gaudi-ui-server
|
||||
ports:
|
||||
- "${NGINX_PORT:-80}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=chatqna-gaudi-ui-server
|
||||
- FRONTEND_SERVICE_PORT=5173
|
||||
- BACKEND_SERVICE_NAME=chatqna
|
||||
- BACKEND_SERVICE_IP=chatqna-gaudi-backend-server
|
||||
- BACKEND_SERVICE_PORT=8888
|
||||
- DATAPREP_SERVICE_IP=dataprep-redis-service
|
||||
- DATAPREP_SERVICE_PORT=5000
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
203
ChatQnA/docker_compose/intel/hpu/gaudi/compose_faqgen_tgi.yaml
Normal file
@@ -0,0 +1,203 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
services:
|
||||
redis-vector-db:
|
||||
image: redis/redis-stack:7.2.0-v9
|
||||
container_name: redis-vector-db
|
||||
ports:
|
||||
- "${CHATQNA_REDIS_VECTOR_PORT:-6379}:6379"
|
||||
- "${CHATQNA_REDIS_VECTOR_INSIGHT_PORT:-8001}:8001"
|
||||
dataprep-redis-service:
|
||||
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
|
||||
container_name: dataprep-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
ports:
|
||||
- "6007:5000"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: redis://redis-vector-db:6379
|
||||
REDIS_HOST: redis-vector-db
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
LOGFLAG: ${LOGFLAG}
|
||||
tei-embedding-service:
|
||||
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
container_name: tei-embedding-gaudi-server
|
||||
ports:
|
||||
- "8090:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
shm_size: 1g
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
|
||||
retriever:
|
||||
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
|
||||
container_name: retriever-redis-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
ports:
|
||||
- "7000:7000"
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
REDIS_URL: redis://redis-vector-db:6379
|
||||
REDIS_HOST: redis-vector-db
|
||||
INDEX_NAME: ${INDEX_NAME}
|
||||
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
|
||||
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
restart: unless-stopped
|
||||
tei-reranking-service:
|
||||
image: ghcr.io/huggingface/tei-gaudi:1.5.0
|
||||
container_name: tei-reranking-gaudi-server
|
||||
ports:
|
||||
- "8808:80"
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
MAX_WARMUP_SEQUENCE_LENGTH: 512
|
||||
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
|
||||
tgi-service:
|
||||
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
|
||||
container_name: tgi-gaudi-server
|
||||
ports:
|
||||
- ${LLM_ENDPOINT_PORT:-8008}:80
|
||||
volumes:
|
||||
- "${MODEL_CACHE:-./data}:/data"
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
HABANA_VISIBLE_DEVICES: all
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
PREFILL_BATCH_BUCKET_SIZE: 1
|
||||
BATCH_BUCKET_SIZE: 8
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
host_ip: ${host_ip}
|
||||
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
|
||||
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS:-4096}
|
||||
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS:-8192}
|
||||
runtime: habana
|
||||
cap_add:
|
||||
- SYS_NICE
|
||||
ipc: host
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 10s
|
||||
retries: 100
|
||||
command: --model-id ${LLM_MODEL_ID} --max-batch-total-tokens 65536 --max-batch-prefill-tokens 4096
|
||||
llm-faqgen:
|
||||
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
||||
container_name: llm-faqgen-server
|
||||
depends_on:
|
||||
tgi-service:
|
||||
condition: service_healthy
|
||||
ports:
|
||||
- ${LLM_SERVER_PORT:-9000}:9000
|
||||
ipc: host
|
||||
environment:
|
||||
no_proxy: ${no_proxy}
|
||||
http_proxy: ${http_proxy}
|
||||
https_proxy: ${https_proxy}
|
||||
LLM_ENDPOINT: ${LLM_ENDPOINT}
|
||||
LLM_MODEL_ID: ${LLM_MODEL_ID}
|
||||
HF_TOKEN: ${HF_TOKEN}
|
||||
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME:-OpeaFaqGenTgi}
|
||||
LOGFLAG: ${LOGFLAG:-False}
|
||||
restart: unless-stopped
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-backend-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
- tei-embedding-service
|
||||
- retriever
|
||||
- tei-reranking-service
|
||||
- tgi-service
|
||||
- llm-faqgen
|
||||
ports:
|
||||
- ${CHATQNA_BACKEND_PORT:-8888}:8888
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- MEGA_SERVICE_HOST_IP=chatqna-gaudi-backend-server
|
||||
- EMBEDDING_SERVER_HOST_IP=tei-embedding-service
|
||||
- EMBEDDING_SERVER_PORT=80
|
||||
- RETRIEVER_SERVICE_HOST_IP=retriever
|
||||
- RETRIEVER_SERVICE_PORT=7000
|
||||
- RERANK_SERVER_HOST_IP=tei-reranking-service
|
||||
- RERANK_SERVER_PORT=80
|
||||
- LLM_SERVER_HOST_IP=llm-faqgen
|
||||
- LLM_SERVER_PORT=9000
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_FAQGEN}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-ui-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-ui-server
|
||||
depends_on:
|
||||
- chatqna-gaudi-backend-server
|
||||
ports:
|
||||
- ${CHATQNA_FRONTEND_SERVICE_PORT:-5173}:5173
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-nginx-server:
|
||||
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-nginx-server
|
||||
depends_on:
|
||||
- chatqna-gaudi-backend-server
|
||||
- chatqna-gaudi-ui-server
|
||||
ports:
|
||||
- "${NGINX_PORT:-80}:80"
|
||||
environment:
|
||||
- no_proxy=${no_proxy}
|
||||
- https_proxy=${https_proxy}
|
||||
- http_proxy=${http_proxy}
|
||||
- FRONTEND_SERVICE_IP=chatqna-gaudi-ui-server
|
||||
- FRONTEND_SERVICE_PORT=5173
|
||||
- BACKEND_SERVICE_NAME=chatqna
|
||||
- BACKEND_SERVICE_IP=chatqna-gaudi-backend-server
|
||||
- BACKEND_SERVICE_PORT=8888
|
||||
- DATAPREP_SERVICE_IP=dataprep-redis-service
|
||||
- DATAPREP_SERVICE_PORT=5000
|
||||
ipc: host
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
@@ -141,7 +141,7 @@ services:
|
||||
ipc: host
|
||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-guardrails-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
@@ -169,6 +169,7 @@ services:
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_GUARDRAILS}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-ui-server:
|
||||
|
||||
@@ -9,7 +9,7 @@ services:
|
||||
tgi-service:
|
||||
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
image: jaegertracing/all-in-one:1.67.0
|
||||
container_name: jaeger
|
||||
ports:
|
||||
- "16686:16686"
|
||||
@@ -23,6 +23,67 @@ services:
|
||||
https_proxy: ${https_proxy}
|
||||
COLLECTOR_ZIPKIN_HOST_PORT: 9411
|
||||
restart: unless-stopped
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.52.0
|
||||
container_name: prometheus
|
||||
user: root
|
||||
volumes:
|
||||
- ./prometheus.yaml:/etc/prometheus/prometheus.yaml
|
||||
- ./prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yaml'
|
||||
ports:
|
||||
- '9090:9090'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
grafana:
|
||||
image: grafana/grafana:11.0.0
|
||||
container_name: grafana
|
||||
volumes:
|
||||
- ./grafana_data:/var/lib/grafana
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
user: root
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
||||
GF_RENDERING_CALLBACK_URL: http://grafana:3000/
|
||||
GF_LOG_FILTERS: rendering:debug
|
||||
depends_on:
|
||||
- prometheus
|
||||
ports:
|
||||
- '3000:3000'
|
||||
ipc: host
|
||||
restart: unless-stopped
|
||||
node-exporter:
|
||||
image: prom/node-exporter
|
||||
container_name: node-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- --collector.filesystem.ignored-mount-points
|
||||
- "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
|
||||
ports:
|
||||
- 9100:9100
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
gaudi-exporter:
|
||||
image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
|
||||
container_name: gaudi-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /dev:/dev
|
||||
ports:
|
||||
- 41612:41611
|
||||
restart: always
|
||||
deploy:
|
||||
mode: global
|
||||
chatqna-gaudi-backend-server:
|
||||
environment:
|
||||
- ENABLE_OPEA_TELEMETRY=true
|
||||
|
||||
@@ -81,7 +81,7 @@ services:
|
||||
ipc: host
|
||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
||||
chatqna-gaudi-backend-server:
|
||||
image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
container_name: chatqna-gaudi-backend-server
|
||||
depends_on:
|
||||
- redis-vector-db
|
||||
@@ -102,6 +102,7 @@ services:
|
||||
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
|
||||
- LLM_MODEL=${LLM_MODEL_ID}
|
||||
- LOGFLAG=${LOGFLAG}
|
||||
- CHATQNA_TYPE=${CHATQNA_TYPE:-CHATQNA_NO_RERANK}
|
||||
ipc: host
|
||||
restart: always
|
||||
chatqna-gaudi-ui-server:
|
||||
|
||||
@@ -0,0 +1,7 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana.json
@@ -0,0 +1,14 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
    options:
      path: /var/lib/grafana/dashboards
@@ -0,0 +1,54 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# config file version
apiVersion: 1

# list of datasources that should be deleted from the database
deleteDatasources:
  - name: Prometheus
    orgId: 1

# list of datasources to insert/update depending
# what's available in the database
datasources:
  # <string, required> name of the datasource. Required
  - name: Prometheus
    # <string, required> datasource type. Required
    type: prometheus
    # <string, required> access mode. direct or proxy. Required
    access: proxy
    # <int> org id. will default to orgId 1 if not specified
    orgId: 1
    # <string> url
    url: http://prometheus:9090
    # <string> database password, if used
    password:
    # <string> database user, if used
    user:
    # <string> database name, if used
    database:
    # <bool> enable/disable basic auth
    basicAuth: false
    # <string> basic auth username, if used
    basicAuthUser:
    # <string> basic auth password, if used
    basicAuthPassword:
    # <bool> enable/disable with credentials headers
    withCredentials:
    # <bool> mark as default datasource. Max one per org
    isDefault: true
    # <map> fields that will be converted to json and stored in json_data
    jsonData:
      httpMethod: GET
      graphiteVersion: "1.1"
      tlsAuth: false
      tlsAuthWithCACert: false
    # <string> json object of data that will be encrypted.
    secureJsonData:
      tlsCACert: "..."
      tlsClientCert: "..."
      tlsClientKey: "..."
    version: 1
    # <bool> allow users to edit datasources from the UI.
    editable: true
47
ChatQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml
Normal file
@@ -0,0 +1,47 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL}
global:
  scrape_interval: 5s
  external_labels:
    monitor: "my-monitor"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["prometheus:9090"]
  - job_name: "vllm"
    metrics_path: /metrics
    static_configs:
      - targets: ["vllm-gaudi-server:80"]
  - job_name: "tgi"
    metrics_path: /metrics
    static_configs:
      - targets: ["tgi-gaudi-server:80"]
  - job_name: "tei-embedding"
    metrics_path: /metrics
    static_configs:
      - targets: ["tei-embedding-gaudi-server:80"]
  - job_name: "tei-reranking"
    metrics_path: /metrics
    static_configs:
      - targets: ["tei-reranking-gaudi-server:80"]
  - job_name: "retriever"
    metrics_path: /metrics
    static_configs:
      - targets: ["retriever:7000"]
  - job_name: "dataprep-redis-service"
    metrics_path: /metrics
    static_configs:
      - targets: ["dataprep-redis-service:5000"]
  - job_name: "chatqna-backend-server"
    metrics_path: /metrics
    static_configs:
      - targets: ["chatqna-gaudi-backend-server:8888"]
  - job_name: "prometheus-node-exporter"
    metrics_path: /metrics
    static_configs:
      - targets: ["node-exporter:9100"]
  - job_name: "prometheus-gaudi-exporter"
    metrics_path: /metrics
    static_configs:
      - targets: ["gaudi-exporter:41611"]
99
ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
Normal file → Executable file
@@ -1,21 +1,94 @@
|
||||
#!/usr/bin/env bash
|
||||
#/usr/bin/env bash
|
||||
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Function to prompt for input and set environment variables
|
||||
prompt_for_env_var() {
|
||||
local var_name="$1"
|
||||
local prompt_message="$2"
|
||||
local default_value="$3"
|
||||
local mandatory="$4"
|
||||
|
||||
if [[ "$mandatory" == "true" ]]; then
|
||||
while [[ -z "$value" ]]; do
|
||||
read -p "$prompt_message [default: \"${default_value}\"]: " value
|
||||
if [[ -z "$value" ]]; then
|
||||
echo "Input cannot be empty. Please try again."
|
||||
fi
|
||||
done
|
||||
else
|
||||
read -p "$prompt_message [default: \"${default_value}\"]: " value
|
||||
fi
|
||||
|
||||
if [[ "$value" == "" ]]; then
|
||||
export "$var_name"="$default_value"
|
||||
else
|
||||
export "$var_name"="$value"
|
||||
fi
|
||||
}
|
||||
|
||||
pushd "../../../../../" > /dev/null
|
||||
source .set_env.sh
|
||||
popd > /dev/null
|
||||
|
||||
# Prompt the user for each required environment variable
|
||||
prompt_for_env_var "EMBEDDING_MODEL_ID" "Enter the EMBEDDING_MODEL_ID" "BAAI/bge-base-en-v1.5" false
|
||||
prompt_for_env_var "HUGGINGFACEHUB_API_TOKEN" "Enter the HUGGINGFACEHUB_API_TOKEN" "" true
|
||||
prompt_for_env_var "RERANK_MODEL_ID" "Enter the RERANK_MODEL_ID" "BAAI/bge-reranker-base" false
|
||||
prompt_for_env_var "LLM_MODEL_ID" "Enter the LLM_MODEL_ID" "meta-llama/Meta-Llama-3-8B-Instruct" false
|
||||
prompt_for_env_var "INDEX_NAME" "Enter the INDEX_NAME" "rag-redis" false
|
||||
prompt_for_env_var "NUM_CARDS" "Enter the number of Gaudi devices" "1" false
|
||||
prompt_for_env_var "host_ip" "Enter the host_ip" "$(curl ifconfig.me)" false
|
||||
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export INDEX_NAME="rag-redis"
|
||||
export NUM_CARDS=1
|
||||
# Set it as a non-null string, such as true, if you want to enable logging facility,
|
||||
# otherwise, keep it as "" to disable it.
|
||||
export LOGFLAG=""
|
||||
# Set OpenTelemetry Tracing Endpoint
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
#Query for enabling http_proxy
|
||||
prompt_for_env_var "http_proxy" "Enter the http_proxy." "" false
|
||||
|
||||
#Query for enabling https_proxy
|
||||
prompt_for_env_var "https_proxy" "Enter the https_proxy." "" false
|
||||
|
||||
#Query for enabling no_proxy
|
||||
prompt_for_env_var "no_proxy" "Enter the no_proxy." "" false
|
||||
|
||||
# Query for enabling logging
|
||||
read -p "Enable logging? (yes/no): " logging && logging=$(echo "$logging" | tr '[:upper:]' '[:lower:]')
|
||||
if [[ "$logging" == "yes" || "$logging" == "y" ]]; then
|
||||
export LOGFLAG=true
|
||||
else
|
||||
export LOGFLAG=false
|
||||
fi
|
||||
|
||||
# Query for enabling OpenTelemetry Tracing Endpoint
|
||||
read -p "Enable OpenTelemetry Tracing Endpoint? (yes/no): " telemetry && telemetry=$(echo "$telemetry" | tr '[:upper:]' '[:lower:]')
|
||||
if [[ "$telemetry" == "yes" || "$telemetry" == "y" ]]; then
|
||||
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
|
||||
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
|
||||
telemetry_flag=true
|
||||
else
|
||||
telemetry_flag=false
|
||||
fi
|
||||
|
||||
# Generate the .env file
|
||||
cat <<EOF > .env
|
||||
#!/bin/bash
|
||||
# Set all required ENV values
|
||||
export TAG=${TAG}
|
||||
export EMBEDDING_MODEL_ID=${EMBEDDING_MODEL_ID}
|
||||
export HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN
|
||||
export RERANK_MODEL_ID=${RERANK_MODEL_ID}
|
||||
export LLM_MODEL_ID=${LLM_MODEL_ID}
|
||||
export INDEX_NAME=${INDEX_NAME}
|
||||
export NUM_CARDS=${NUM_CARDS}
|
||||
export host_ip=${host_ip}
|
||||
export http_proxy=${http_proxy}
|
||||
export https_proxy=${https_proxy}
|
||||
export LOGFLAG=${LOGFLAG}
|
||||
export JAEGER_IP=${JAEGER_IP}
|
||||
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}
|
||||
export TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
|
||||
export no_proxy="${no_proxy},chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-exporter,node-exporter,$JAEGER_IP"
|
||||
EOF
|
||||
|
||||
echo ".env file has been created with the following content:"
|
||||
cat .env
|
||||
|
||||
@@ -11,18 +11,6 @@ services:
|
||||
context: ../
|
||||
dockerfile: ./Dockerfile
|
||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||
chatqna-guardrails:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: ./Dockerfile.guardrails
|
||||
extends: chatqna
|
||||
image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
|
||||
chatqna-without-rerank:
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: ./Dockerfile.without_rerank
|
||||
extends: chatqna
|
||||
image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
|
||||
chatqna-ui:
|
||||
build:
|
||||
context: ../ui
|
||||
@@ -59,6 +47,12 @@ services:
|
||||
dockerfile: comps/llms/src/text-generation/Dockerfile
|
||||
extends: chatqna
|
||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||
llm-faqgen:
|
||||
build:
|
||||
context: GenAIComps
|
||||
dockerfile: comps/llms/src/faq-generation/Dockerfile
|
||||
extends: chatqna
|
||||
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
||||
dataprep:
|
||||
build:
|
||||
context: GenAIComps
|
||||
|
||||
14
ChatQnA/entrypoint.sh
Normal file
@@ -0,0 +1,14 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

chatqna_arg=$CHATQNA_TYPE

if [[ $chatqna_arg == "CHATQNA_FAQGEN" ]]; then
    python chatqna.py --faqgen
elif [[ $chatqna_arg == "CHATQNA_NO_RERANK" ]]; then
    python chatqna.py --without-rerank
elif [[ $chatqna_arg == "CHATQNA_GUARDRAILS" ]]; then
    python chatqna.py --with-guardrails
else
    python chatqna.py
fi
270
ChatQnA/tests/test_compose_faqgen_on_gaudi.sh
Normal file
@@ -0,0 +1,270 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
IMAGE_REPO=${IMAGE_REPO:-"opea"}
|
||||
IMAGE_TAG=${IMAGE_TAG:-"latest"}
|
||||
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"/data/cache"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
ip_address=$(hostname -I | awk '{print $1}')
|
||||
|
||||
function build_docker_images() {
|
||||
opea_branch=${opea_branch:-"main"}
|
||||
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
|
||||
if [[ "${opea_branch}" != "main" ]]; then
|
||||
cd $WORKPATH
|
||||
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
|
||||
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
|
||||
find . -type f -name "Dockerfile*" | while read -r file; do
|
||||
echo "Processing file: $file"
|
||||
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
|
||||
done
|
||||
fi
|
||||
cd $WORKPATH/docker_image_build
|
||||
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
|
||||
git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
|
||||
VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
|
||||
git checkout ${VLLM_VER} &> /dev/null && cd ../
|
||||
|
||||
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
|
||||
service_list="chatqna chatqna-ui dataprep retriever llm-faqgen vllm-gaudi nginx"
|
||||
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
|
||||
|
||||
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
|
||||
docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
|
||||
docker images && sleep 1s
|
||||
}
|
||||
|
||||
function start_services() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export NUM_CARDS=1
|
||||
export INDEX_NAME="rag-redis"
|
||||
export host_ip=${ip_address}
|
||||
export LLM_ENDPOINT_PORT=8010
|
||||
export LLM_SERVER_PORT=9001
|
||||
export CHATQNA_BACKEND_PORT=8888
|
||||
export CHATQNA_REDIS_VECTOR_PORT=6377
|
||||
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8006
|
||||
export CHATQNA_FRONTEND_SERVICE_PORT=5175
|
||||
export NGINX_PORT=80
|
||||
export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
|
||||
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
|
||||
export HF_TOKEN=${HF_TOKEN}
|
||||
export VLLM_SKIP_WARMUP=true
|
||||
export LOGFLAG=True
|
||||
export http_proxy=${http_proxy}
|
||||
export https_proxy=${https_proxy}
|
||||
export no_proxy="${ip_address},redis-vector-db,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,llm-faqgen,chatqna-gaudi-backend-server,chatqna-gaudi-ui-server,chatqna-gaudi-nginx-server"
|
||||
|
||||
# Start Docker Containers
|
||||
docker compose -f compose_faqgen.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
|
||||
|
||||
sleep 30s
|
||||
}
|
||||
|
||||
function validate_service() {
|
||||
local URL="$1"
|
||||
local EXPECTED_RESULT="$2"
|
||||
local SERVICE_NAME="$3"
|
||||
local DOCKER_NAME="$4"
|
||||
local INPUT_DATA="$5"
|
||||
|
||||
if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
|
||||
cd $LOG_PATH
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
|
||||
elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL")
|
||||
elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
|
||||
elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
|
||||
else
|
||||
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
|
||||
fi
|
||||
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
|
||||
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
|
||||
|
||||
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
|
||||
|
||||
# check response status
|
||||
if [ "$HTTP_STATUS" -ne "200" ]; then
|
||||
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
|
||||
exit 1
|
||||
else
|
||||
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
|
||||
fi
|
||||
echo "Response"
|
||||
echo $RESPONSE_BODY
|
||||
echo "Expected Result"
|
||||
echo $EXPECTED_RESULT
|
||||
# check response body
|
||||
if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
|
||||
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
|
||||
exit 1
|
||||
else
|
||||
echo "[ $SERVICE_NAME ] Content is as expected."
|
||||
fi
|
||||
|
||||
sleep 1s
|
||||
}
|
||||
|
||||
function validate_microservices() {
|
||||
# Check if the microservices are running correctly.
|
||||
|
||||
# tei for embedding service
|
||||
validate_service \
|
||||
"${ip_address}:8090/embed" \
|
||||
"[[" \
|
||||
"tei-embedding" \
|
||||
"tei-embedding-gaudi-server" \
|
||||
'{"inputs":"What is Deep Learning?"}'
|
||||
|
||||
sleep 1m # retrieval can't curl as expected, try to wait for more time
|
||||
|
||||
# test /v1/dataprep upload file
|
||||
echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
|
||||
validate_service \
|
||||
"http://${ip_address}:6007/v1/dataprep/ingest" \
|
||||
"Data preparation succeeded" \
|
||||
"dataprep_upload_file" \
|
||||
"dataprep-redis-server"
|
||||
|
||||
# test /v1/dataprep upload link
|
||||
validate_service \
|
||||
"http://${ip_address}:6007/v1/dataprep/ingest" \
|
||||
"Data preparation succeeded" \
|
||||
"dataprep_upload_link" \
|
||||
"dataprep-redis-server"
|
||||
|
||||
# test /v1/dataprep/get_file
|
||||
validate_service \
|
||||
"http://${ip_address}:6007/v1/dataprep/get" \
|
||||
'{"name":' \
|
||||
"dataprep_get" \
|
||||
"dataprep-redis-server"
|
||||
|
||||
# test /v1/dataprep/delete_file
|
||||
validate_service \
|
||||
"http://${ip_address}:6007/v1/dataprep/delete" \
|
||||
'{"status":true}' \
|
||||
"dataprep_del" \
|
||||
"dataprep-redis-server"
|
||||
|
||||
# retrieval microservice
|
||||
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
|
||||
validate_service \
|
||||
"${ip_address}:7000/v1/retrieval" \
|
||||
" " \
|
||||
"retrieval" \
|
||||
"retriever-redis-server" \
|
||||
"{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"
|
||||
|
||||
# tei for rerank microservice
|
||||
echo "validate tei..."
|
||||
validate_service \
|
||||
"${ip_address}:8808/rerank" \
|
||||
'{"index":1,"score":' \
|
||||
"tei-rerank" \
|
||||
"tei-reranking-gaudi-server" \
|
||||
'{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'
|
||||
|
||||
# vllm for llm service
|
||||
echo "validate vllm..."
|
||||
validate_service \
|
||||
"${ip_address}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
|
||||
"content" \
|
||||
"vllm-llm" \
|
||||
"vllm-gaudi-server" \
|
||||
'{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}'
|
||||
|
||||
# faqgen llm microservice
|
||||
echo "validate llm-faqgen..."
|
||||
validate_service \
|
||||
"${ip_address}:${LLM_SERVER_PORT}/v1/faqgen" \
|
||||
"text" \
|
||||
"llm" \
|
||||
"llm-faqgen-server" \
|
||||
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
|
||||
}
|
||||
|
||||
function validate_megaservice() {
|
||||
# Curl the Mega Service
|
||||
validate_service \
|
||||
"${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
|
||||
"Embed" \
|
||||
"chatqna-megaservice" \
|
||||
"chatqna-gaudi-backend-server" \
|
||||
'{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32}'
|
||||
|
||||
validate_service \
|
||||
"${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
|
||||
"Embed" \
|
||||
"chatqna-megaservice" \
|
||||
"chatqna-gaudi-backend-server" \
|
||||
'{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32,"stream":false}'
|
||||
|
||||
}
|
||||
|
||||
function validate_frontend() {
|
||||
cd $WORKPATH/ui/svelte
|
||||
local conda_env_name="OPEA_e2e"
|
||||
export PATH=${HOME}/miniforge3/bin/:$PATH
|
||||
if conda info --envs | grep -q "$conda_env_name"; then
|
||||
echo "$conda_env_name exist!"
|
||||
else
|
||||
conda create -n ${conda_env_name} python=3.12 -y
|
||||
fi
|
||||
source activate ${conda_env_name}
|
||||
|
||||
sed -i "s/localhost/$ip_address/g" playwright.config.ts
|
||||
|
||||
conda install -c conda-forge nodejs=22.6.0 -y
|
||||
npm install && npm ci && npx playwright install --with-deps
|
||||
node -v && npm -v && pip list
|
||||
|
||||
exit_status=0
|
||||
npx playwright test || exit_status=$?
|
||||
|
||||
if [ $exit_status -ne 0 ]; then
|
||||
echo "[TEST INFO]: ---------frontend test failed---------"
|
||||
exit $exit_status
|
||||
else
|
||||
echo "[TEST INFO]: ---------frontend test passed---------"
|
||||
fi
|
||||
}
|
||||
|
||||
function stop_docker() {
|
||||
cd $WORKPATH/docker_compose/intel/hpu/gaudi
|
||||
docker compose -f compose_faqgen.yaml down
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
||||
stop_docker
|
||||
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
|
||||
start_time=$(date +%s)
|
||||
start_services
|
||||
end_time=$(date +%s)
|
||||
duration=$((end_time-start_time))
|
||||
echo "Mega service start duration is $duration s"
|
||||
|
||||
validate_microservices
|
||||
validate_megaservice
|
||||
validate_frontend
|
||||
|
||||
stop_docker
|
||||
echo y | docker system prune
|
||||
|
||||
}
|
||||
|
||||
main
|
||||
292
ChatQnA/tests/test_compose_faqgen_on_rocm.sh
Normal file
@@ -0,0 +1,292 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2024 Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
set -xe
|
||||
IMAGE_REPO=${IMAGE_REPO:-"opea"}
|
||||
IMAGE_TAG=${IMAGE_TAG:-"latest"}
|
||||
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
|
||||
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
|
||||
export REGISTRY=${IMAGE_REPO}
|
||||
export TAG=${IMAGE_TAG}
|
||||
export MODEL_CACHE=${model_cache:-"/var/opea/chatqna-service/data"}
|
||||
|
||||
WORKPATH=$(dirname "$PWD")
|
||||
LOG_PATH="$WORKPATH/tests"
|
||||
ip_address=$(hostname -I | awk '{print $1}')
|
||||
|
||||
export HOST_IP=${ip_address}
|
||||
export CHATQNA_TGI_SERVICE_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
|
||||
export CHATQNA_EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
|
||||
export CHATQNA_RERANK_MODEL_ID="BAAI/bge-reranker-base"
|
||||
export CHATQNA_LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
export CHATQNA_TGI_SERVICE_PORT=9009
|
||||
export CHATQNA_TEI_EMBEDDING_PORT=8090
|
||||
export CHATQNA_TEI_EMBEDDING_ENDPOINT="http://${HOST_IP}:${CHATQNA_TEI_EMBEDDING_PORT}"
|
||||
export CHATQNA_TEI_RERANKING_PORT=8808
|
||||
export CHATQNA_REDIS_VECTOR_PORT=6379
|
||||
export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8001
|
||||
export CHATQNA_REDIS_DATAPREP_PORT=6007
|
||||
export CHATQNA_REDIS_RETRIEVER_PORT=7000
|
||||
export CHATQNA_LLM_FAQGEN_PORT=18010
|
||||
export CHATQNA_INDEX_NAME="rag-redis"
|
||||
export CHATQNA_MEGA_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_RETRIEVER_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_BACKEND_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_BACKEND_SERVICE_PORT}/v1/chatqna"
|
||||
export CHATQNA_DATAPREP_SERVICE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/ingest"
|
||||
export CHATQNA_DATAPREP_GET_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/get"
|
||||
export CHATQNA_DATAPREP_DELETE_FILE_ENDPOINT="http://127.0.0.1:${CHATQNA_REDIS_DATAPREP_PORT}/v1/dataprep/delete"
|
||||
export CHATQNA_FRONTEND_SERVICE_IP=${HOST_IP}
|
||||
export CHATQNA_FRONTEND_SERVICE_PORT=15173
|
||||
export CHATQNA_BACKEND_SERVICE_NAME=chatqna
|
||||
export CHATQNA_BACKEND_SERVICE_IP=${HOST_IP}
|
||||
export CHATQNA_BACKEND_SERVICE_PORT=8888
|
||||
export CHATQNA_REDIS_URL="redis://${HOST_IP}:${CHATQNA_REDIS_VECTOR_PORT}"
|
||||
export CHATQNA_EMBEDDING_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_RERANK_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_LLM_SERVICE_HOST_IP=${HOST_IP}
|
||||
export CHATQNA_NGINX_PORT=80
|
||||
export CHATQNA_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||
export PATH="~/miniconda3/bin:$PATH"
|
||||
|
||||
function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi

    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="chatqna chatqna-ui dataprep retriever llm-faqgen nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > "${LOG_PATH}"/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5

    docker images && sleep 1s
}

function start_services() {
    cd "$WORKPATH"/docker_compose/amd/gpu/rocm

    # Start Docker Containers
    docker compose -f compose_faqgen.yaml up -d > "${LOG_PATH}"/start_services_with_compose.log

    n=0
    until [[ "$n" -ge 160 ]]; do
        docker logs chatqna-tgi-server > "${LOG_PATH}"/tgi_service_start.log
        if grep -q Connected "${LOG_PATH}"/tgi_service_start.log; then
            break
        fi
        sleep 5s
        n=$((n+1))
    done

    echo "all containers start!"
}

function validate_service() {
    local URL="$1"
    local EXPECTED_RESULT="$2"
    local SERVICE_NAME="$3"
    local DOCKER_NAME="$4"
    local INPUT_DATA="$5"

    if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
        cd "$LOG_PATH"
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
    else
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
    fi
    HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
    RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed -e 's/HTTPSTATUS\:.*//g')

    docker logs "${DOCKER_NAME}" >> "${LOG_PATH}"/"${SERVICE_NAME}".log

    # check response status
    if [ "$HTTP_STATUS" -ne "200" ]; then
        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
        exit 1
    else
        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
    fi
    # check response body
    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
        exit 1
    else
        echo "[ $SERVICE_NAME ] Content is as expected."
    fi

    sleep 1s
}
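The helper above captures the response body and the HTTP status code in a single curl invocation and then splits them apart on the HTTPSTATUS marker. A minimal sketch of that pattern in isolation, assuming a locally reachable endpoint (the URL and payload below are illustrative placeholders, not part of the test suite):

# Sketch of the status/body split used by validate_service.
# URL and payload are illustrative placeholders only.
URL="http://localhost:8888/v1/chatqna"
RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" \
    -X POST -H 'Content-Type: application/json' \
    -d '{"messages": "What is Deep Learning?"}' "$URL")
STATUS=$(echo "$RESPONSE" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')   # everything after the marker
BODY=$(echo "$RESPONSE" | sed -e 's/HTTPSTATUS:.*//')                  # everything before the marker
echo "status=$STATUS"
echo "body=$BODY"

Doing it in one request keeps the body and the status consistent with each other, which matters when a failing container would otherwise return different results on back-to-back calls.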
function validate_microservices() {
    # Check if the microservices are running correctly.

    # tei for embedding service
    validate_service \
        "${ip_address}:8090/embed" \
        "[[" \
        "tei-embedding" \
        "chatqna-tei-embedding-server" \
        '{"inputs":"What is Deep Learning?"}'

    sleep 1m # retrieval can't curl as expected, try to wait for more time

    # test /v1/dataprep/ingest upload file
    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > "$LOG_PATH"/dataprep_file.txt
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_file" \
        "dataprep-redis-server"

    # test /v1/dataprep/ingest upload link
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_link" \
        "dataprep-redis-server"

    # test /v1/dataprep/get
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/get" \
        '{"name":' \
        "dataprep_get" \
        "dataprep-redis-server"

    # test /v1/dataprep/delete
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/delete" \
        '{"status":true}' \
        "dataprep_del" \
        "dataprep-redis-server"

    # retrieval microservice
    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
    validate_service \
        "${ip_address}:7000/v1/retrieval" \
        "retrieved_docs" \
        "retrieval-microservice" \
        "chatqna-retriever-redis-server" \
        "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"

    # tei for rerank microservice
    validate_service \
        "${ip_address}:8808/rerank" \
        '{"index":1,"score":' \
        "tei-rerank" \
        "chatqna-tei-reranking-server" \
        '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'

    # tgi for llm service
    validate_service \
        "${ip_address}:9009/generate" \
        "generated_text" \
        "tgi-llm" \
        "chatqna-tgi-server" \
        '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'

    # faqgen llm microservice
    echo "validate llm-faqgen..."
    validate_service \
        "${ip_address}:${CHATQNA_LLM_FAQGEN_PORT}/v1/faqgen" \
        "text" \
        "llm" \
        "llm-faqgen-server" \
        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'

}

function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
        "${ip_address}:8888/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32}'

    validate_service \
        "${ip_address}:8888/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32,"stream":false}'

}

function validate_frontend() {
    echo "[ TEST INFO ]: --------- frontend test started ---------"
    cd "$WORKPATH"/ui/svelte
    local conda_env_name="OPEA_e2e"
    export PATH=${HOME}/miniforge3/bin/:$PATH
    if conda info --envs | grep -q "$conda_env_name"; then
        echo "$conda_env_name exist!"
    else
        conda create -n ${conda_env_name} python=3.12 -y
    fi
    source activate ${conda_env_name}
    echo "[ TEST INFO ]: --------- conda env activated ---------"

    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
    node -v && npm -v && pip list

    exit_status=0
    npx playwright test || exit_status=$?

    if [ $exit_status -ne 0 ]; then
        echo "[TEST INFO]: ---------frontend test failed---------"
        exit $exit_status
    else
        echo "[TEST INFO]: ---------frontend test passed---------"
    fi
}

function stop_docker() {
    cd "$WORKPATH"/docker_compose/amd/gpu/rocm
    docker compose -f compose_faqgen.yaml stop && docker compose rm -f
}

function main() {

    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_time=$(date +%s)
    start_services
    end_time=$(date +%s)
    duration=$((end_time-start_time))
    echo "Mega service start duration is $duration s" && sleep 1s

    validate_microservices
    echo "==== microservices validated ===="
    validate_megaservice
    echo "==== megaservice validated ===="
    validate_frontend
    echo "==== frontend validated ===="

    stop_docker
    echo y | docker system prune

}

main
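Like the other ChatQnA compose tests, the script above is driven entirely by environment variables, so a local run only needs the registry/tag pair and a Hugging Face token exported before invoking it from the tests directory. A hedged sketch of such a run, assuming a standard repository checkout (the checkout path, token value, and script filename are assumptions, not shown in this diff):

# Hypothetical local invocation; path, token, and filename are assumptions.
export IMAGE_REPO=opea
export IMAGE_TAG=latest
export HUGGINGFACEHUB_API_TOKEN="hf_xxx"    # placeholder: a valid token is needed to pull gated models
cd GenAIExamples/ChatQnA/tests              # assumed repository layout
bash test_compose_faqgen_on_rocm.sh         # assumed filename for the ROCm FAQGen test above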
270 ChatQnA/tests/test_compose_faqgen_on_xeon.sh Normal file
@@ -0,0 +1,270 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"/data/cache"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi
    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
    git clone https://github.com/vllm-project/vllm.git && cd vllm
    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="chatqna chatqna-ui dataprep retriever llm-faqgen vllm nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon
    export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
    export RERANK_MODEL_ID="BAAI/bge-reranker-base"
    export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
    export INDEX_NAME="rag-redis"
    export host_ip=${ip_address}
    export LLM_ENDPOINT_PORT=8010
    export LLM_SERVER_PORT=9001
    export CHATQNA_BACKEND_PORT=8888
    export CHATQNA_REDIS_VECTOR_PORT=6377
    export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8006
    export CHATQNA_FRONTEND_SERVICE_PORT=5175
    export NGINX_PORT=80
    export FAQGen_COMPONENT_NAME="OpeaFaqGenvLLM"
    export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
    export HF_TOKEN=${HF_TOKEN}
    export VLLM_SKIP_WARMUP=true
    export LOGFLAG=True
    export http_proxy=${http_proxy}
    export https_proxy=${https_proxy}
    export no_proxy="${ip_address},redis-vector-db,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,llm-faqgen,chatqna-xeon-backend-server,chatqna-xeon-ui-server,chatqna-xeon-nginx-server"

    # Start Docker Containers
    docker compose -f compose_faqgen.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

    sleep 30s
}

function validate_service() {
    local URL="$1"
    local EXPECTED_RESULT="$2"
    local SERVICE_NAME="$3"
    local DOCKER_NAME="$4"
    local INPUT_DATA="$5"

    if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
        cd $LOG_PATH
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
    else
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
    fi
    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')

    docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log

    # check response status
    if [ "$HTTP_STATUS" -ne "200" ]; then
        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
        exit 1
    else
        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
    fi
    echo "Response"
    echo $RESPONSE_BODY
    echo "Expected Result"
    echo $EXPECTED_RESULT
    # check response body
    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
        exit 1
    else
        echo "[ $SERVICE_NAME ] Content is as expected."
    fi

    sleep 1s
}

function validate_microservices() {
    # Check if the microservices are running correctly.
    sleep 3m

    # tei for embedding service
    validate_service \
        "${ip_address}:6006/embed" \
        "[[" \
        "tei-embedding" \
        "tei-embedding-server" \
        '{"inputs":"What is Deep Learning?"}'

    sleep 1m # retrieval can't curl as expected, try to wait for more time

    # test /v1/dataprep upload file
    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_file" \
        "dataprep-redis-server"

    # test /v1/dataprep upload link
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_link" \
        "dataprep-redis-server"

    # test /v1/dataprep/get_file
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/get" \
        '{"name":' \
        "dataprep_get" \
        "dataprep-redis-server"

    # test /v1/dataprep/delete_file
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/delete" \
        '{"status":true}' \
        "dataprep_del" \
        "dataprep-redis-server"

    # retrieval microservice
    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
    validate_service \
        "${ip_address}:7000/v1/retrieval" \
        " " \
        "retrieval" \
        "retriever-redis-server" \
        "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"

    # tei for rerank microservice
    echo "validate tei..."
    validate_service \
        "${ip_address}:8808/rerank" \
        '{"index":1,"score":' \
        "tei-rerank" \
        "tei-reranking-server" \
        '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'

    # vllm for llm service
    echo "validate vllm..."
    validate_service \
        "${ip_address}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
        "content" \
        "vllm-llm" \
        "vllm-server" \
        '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}'

    # faqgen llm microservice
    echo "validate llm-faqgen..."
    validate_service \
        "${ip_address}:${LLM_SERVER_PORT}/v1/faqgen" \
        "text" \
        "llm" \
        "llm-faqgen-server" \
        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
}

function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-xeon-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32}'

    validate_service \
        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-xeon-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32,"stream":false}'

}

function validate_frontend() {
    cd $WORKPATH/ui/svelte
    local conda_env_name="OPEA_e2e"
    export PATH=${HOME}/miniforge3/bin/:$PATH
    if conda info --envs | grep -q "$conda_env_name"; then
        echo "$conda_env_name exist!"
    else
        conda create -n ${conda_env_name} python=3.12 -y
    fi
    source activate ${conda_env_name}

    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
    node -v && npm -v && pip list

    exit_status=0
    npx playwright test || exit_status=$?

    if [ $exit_status -ne 0 ]; then
        echo "[TEST INFO]: ---------frontend test failed---------"
        exit $exit_status
    else
        echo "[TEST INFO]: ---------frontend test passed---------"
    fi
}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon
    docker compose -f compose_faqgen.yaml down
}

function main() {

    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_time=$(date +%s)
    start_services
    end_time=$(date +%s)
    duration=$((end_time-start_time))
    echo "Mega service start duration is $duration s" && sleep 1s

    validate_microservices
    validate_megaservice
    validate_frontend

    stop_docker
    echo y | docker system prune

}

main
266 ChatQnA/tests/test_compose_faqgen_tgi_on_gaudi.sh Normal file
@@ -0,0 +1,266 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"/data/cache"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi
    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="chatqna chatqna-ui dataprep retriever llm-faqgen nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker pull ghcr.io/huggingface/tei-gaudi:1.5.0
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
    export RERANK_MODEL_ID="BAAI/bge-reranker-base"
    export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
    export INDEX_NAME="rag-redis"
    export host_ip=${ip_address}
    export LLM_ENDPOINT_PORT=8010
    export LLM_SERVER_PORT=9001
    export CHATQNA_BACKEND_PORT=8888
    export CHATQNA_REDIS_VECTOR_PORT=6377
    export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8006
    export CHATQNA_FRONTEND_SERVICE_PORT=5175
    export NGINX_PORT=80
    export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
    export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
    export HF_TOKEN=${HF_TOKEN}
    export LOGFLAG=True
    export http_proxy=${http_proxy}
    export https_proxy=${https_proxy}
    export no_proxy="${ip_address},redis-vector-db,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,llm-faqgen,chatqna-gaudi-backend-server,chatqna-gaudi-ui-server,chatqna-gaudi-nginx-server"

    # Start Docker Containers
    docker compose -f compose_faqgen_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

    sleep 30s
}

function validate_service() {
    local URL="$1"
    local EXPECTED_RESULT="$2"
    local SERVICE_NAME="$3"
    local DOCKER_NAME="$4"
    local INPUT_DATA="$5"

    if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
        cd $LOG_PATH
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
    else
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
    fi
    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')

    docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log

    # check response status
    if [ "$HTTP_STATUS" -ne "200" ]; then
        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
        exit 1
    else
        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
    fi
    echo "Response"
    echo $RESPONSE_BODY
    echo "Expected Result"
    echo $EXPECTED_RESULT
    # check response body
    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
        exit 1
    else
        echo "[ $SERVICE_NAME ] Content is as expected."
    fi

    sleep 1s
}

function validate_microservices() {
    # Check if the microservices are running correctly.

    # tei for embedding service
    validate_service \
        "${ip_address}:8090/embed" \
        "[[" \
        "tei-embedding" \
        "tei-embedding-gaudi-server" \
        '{"inputs":"What is Deep Learning?"}'

    sleep 1m # retrieval can't curl as expected, try to wait for more time

    # test /v1/dataprep upload file
    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_file" \
        "dataprep-redis-server"

    # test /v1/dataprep upload link
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_link" \
        "dataprep-redis-server"

    # test /v1/dataprep/get_file
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/get" \
        '{"name":' \
        "dataprep_get" \
        "dataprep-redis-server"

    # test /v1/dataprep/delete_file
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/delete" \
        '{"status":true}' \
        "dataprep_del" \
        "dataprep-redis-server"

    # retrieval microservice
    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
    validate_service \
        "${ip_address}:7000/v1/retrieval" \
        " " \
        "retrieval" \
        "retriever-redis-server" \
        "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"

    # tei for rerank microservice
    echo "validate tei..."
    validate_service \
        "${ip_address}:8808/rerank" \
        '{"index":1,"score":' \
        "tei-rerank" \
        "tei-reranking-gaudi-server" \
        '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'

    # tgi for llm service
    echo "validate tgi..."
    validate_service \
        "${ip_address}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
        "content" \
        "tgi-llm" \
        "tgi-gaudi-server" \
        '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}'

    # faqgen llm microservice
    echo "validate llm-faqgen..."
    validate_service \
        "${ip_address}:${LLM_SERVER_PORT}/v1/faqgen" \
        "text" \
        "llm" \
        "llm-faqgen-server" \
        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
}

function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-gaudi-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32}'

    validate_service \
        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-gaudi-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32,"stream":false}'

}

function validate_frontend() {
    cd $WORKPATH/ui/svelte
    local conda_env_name="OPEA_e2e"
    export PATH=${HOME}/miniforge3/bin/:$PATH
    if conda info --envs | grep -q "$conda_env_name"; then
        echo "$conda_env_name exist!"
    else
        conda create -n ${conda_env_name} python=3.12 -y
    fi
    source activate ${conda_env_name}

    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
    node -v && npm -v && pip list

    exit_status=0
    npx playwright test || exit_status=$?

    if [ $exit_status -ne 0 ]; then
        echo "[TEST INFO]: ---------frontend test failed---------"
        exit $exit_status
    else
        echo "[TEST INFO]: ---------frontend test passed---------"
    fi
}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/hpu/gaudi
    docker compose -f compose_faqgen_tgi.yaml down
}

function main() {

    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_time=$(date +%s)
    start_services
    end_time=$(date +%s)
    duration=$((end_time-start_time))
    echo "Mega service start duration is $duration s"

    validate_microservices
    validate_megaservice
    validate_frontend

    stop_docker
    echo y | docker system prune

}

main
270 ChatQnA/tests/test_compose_faqgen_tgi_on_xeon.sh Normal file
@@ -0,0 +1,270 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"/data/cache"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')

function build_docker_images() {
    opea_branch=${opea_branch:-"main"}
    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
    if [[ "${opea_branch}" != "main" ]]; then
        cd $WORKPATH
        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
        find . -type f -name "Dockerfile*" | while read -r file; do
            echo "Processing file: $file"
            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
        done
    fi
    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
    git clone https://github.com/vllm-project/vllm.git && cd vllm
    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
    echo "Check out vLLM tag ${VLLM_VER}"
    git checkout ${VLLM_VER} &> /dev/null && cd ../

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="chatqna chatqna-ui dataprep retriever llm-faqgen nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    docker images && sleep 1s
}

function start_services() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon
    export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
    export RERANK_MODEL_ID="BAAI/bge-reranker-base"
    export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
    export INDEX_NAME="rag-redis"
    export host_ip=${ip_address}
    export LLM_ENDPOINT_PORT=8010
    export LLM_SERVER_PORT=9001
    export CHATQNA_BACKEND_PORT=8888
    export CHATQNA_REDIS_VECTOR_PORT=6377
    export CHATQNA_REDIS_VECTOR_INSIGHT_PORT=8006
    export CHATQNA_FRONTEND_SERVICE_PORT=5175
    export NGINX_PORT=80
    export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi"
    export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
    export HF_TOKEN=${HF_TOKEN}
    export LOGFLAG=True
    export http_proxy=${http_proxy}
    export https_proxy=${https_proxy}
    export no_proxy="${ip_address},redis-vector-db,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,llm-faqgen,chatqna-xeon-backend-server,chatqna-xeon-ui-server,chatqna-xeon-nginx-server"

    # Start Docker Containers
    docker compose -f compose_faqgen_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log

    sleep 30s
}

function validate_service() {
    local URL="$1"
    local EXPECTED_RESULT="$2"
    local SERVICE_NAME="$3"
    local DOCKER_NAME="$4"
    local INPUT_DATA="$5"

    if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
        cd $LOG_PATH
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
    elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
    else
        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
    fi
    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')

    docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log

    # check response status
    if [ "$HTTP_STATUS" -ne "200" ]; then
        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
        exit 1
    else
        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
    fi
    echo "Response"
    echo $RESPONSE_BODY
    echo "Expected Result"
    echo $EXPECTED_RESULT
    # check response body
    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
        exit 1
    else
        echo "[ $SERVICE_NAME ] Content is as expected."
    fi

    sleep 1s
}

function validate_microservices() {
    # Check if the microservices are running correctly.
    sleep 3m

    # tei for embedding service
    validate_service \
        "${ip_address}:6006/embed" \
        "[[" \
        "tei-embedding" \
        "tei-embedding-server" \
        '{"inputs":"What is Deep Learning?"}'

    sleep 1m # retrieval can't curl as expected, try to wait for more time

    # test /v1/dataprep upload file
    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_file" \
        "dataprep-redis-server"

    # test /v1/dataprep upload link
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/ingest" \
        "Data preparation succeeded" \
        "dataprep_upload_link" \
        "dataprep-redis-server"

    # test /v1/dataprep/get_file
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/get" \
        '{"name":' \
        "dataprep_get" \
        "dataprep-redis-server"

    # test /v1/dataprep/delete_file
    validate_service \
        "http://${ip_address}:6007/v1/dataprep/delete" \
        '{"status":true}' \
        "dataprep_del" \
        "dataprep-redis-server"

    # retrieval microservice
    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
    validate_service \
        "${ip_address}:7000/v1/retrieval" \
        " " \
        "retrieval" \
        "retriever-redis-server" \
        "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"

    # tei for rerank microservice
    echo "validate tei..."
    validate_service \
        "${ip_address}:8808/rerank" \
        '{"index":1,"score":' \
        "tei-rerank" \
        "tei-reranking-server" \
        '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'

    # tgi for llm service
    echo "validate tgi..."
    validate_service \
        "${ip_address}:${LLM_ENDPOINT_PORT}/v1/chat/completions" \
        "content" \
        "tgi-llm" \
        "tgi-server" \
        '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}'

    # faqgen llm microservice
    echo "validate llm-faqgen..."
    validate_service \
        "${ip_address}:${LLM_SERVER_PORT}/v1/faqgen" \
        "text" \
        "llm" \
        "llm-faqgen-server" \
        '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
}

function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-xeon-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32}'

    validate_service \
        "${ip_address}:${CHATQNA_BACKEND_PORT}/v1/chatqna" \
        "Embed" \
        "chatqna-megaservice" \
        "chatqna-xeon-backend-server" \
        '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens":32,"stream":false}'

}

function validate_frontend() {
    cd $WORKPATH/ui/svelte
    local conda_env_name="OPEA_e2e"
    export PATH=${HOME}/miniforge3/bin/:$PATH
    if conda info --envs | grep -q "$conda_env_name"; then
        echo "$conda_env_name exist!"
    else
        conda create -n ${conda_env_name} python=3.12 -y
    fi
    source activate ${conda_env_name}

    sed -i "s/localhost/$ip_address/g" playwright.config.ts

    conda install -c conda-forge nodejs=22.6.0 -y
    npm install && npm ci && npx playwright install --with-deps
    node -v && npm -v && pip list

    exit_status=0
    npx playwright test || exit_status=$?

    if [ $exit_status -ne 0 ]; then
        echo "[TEST INFO]: ---------frontend test failed---------"
        exit $exit_status
    else
        echo "[TEST INFO]: ---------frontend test passed---------"
    fi
}

function stop_docker() {
    cd $WORKPATH/docker_compose/intel/cpu/xeon
    docker compose -f compose_faqgen_tgi.yaml down
}

function main() {

    stop_docker
    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
    start_time=$(date +%s)
    start_services
    end_time=$(date +%s)
    duration=$((end_time-start_time))
    echo "Mega service start duration is $duration s" && sleep 1s

    validate_microservices
    validate_megaservice
    validate_frontend

    stop_docker
    echo y | docker system prune

}

main
@@ -29,10 +29,12 @@ function build_docker_images() {
    fi
    cd $WORKPATH/docker_image_build
    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
    git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
    git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork
    VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)")
    git checkout ${VLLM_VER} &> /dev/null && cd ../

    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
    service_list="chatqna-guardrails chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
    service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

    docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6

@@ -160,7 +162,7 @@ function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
        "${ip_address}:8888/v1/chatqna" \
        "data: " \
        "Nike" \
        "mega-chatqna" \
        "chatqna-gaudi-guardrails-server" \
        '{"messages": "What is the revenue of Nike in 2023?"}'

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

@@ -180,7 +181,7 @@ function validate_megaservice() {
    # Curl the Mega Service
    validate_service \
        "${ip_address}:8888/v1/chatqna" \
        "data: " \
        "Nike" \
        "chatqna-megaservice" \
        "chatqna-xeon-backend-server" \
        '{"messages": "What is the revenue of Nike in 2023?"}'

@@ -240,6 +241,8 @@ function main() {
    echo "==== microservices validated ===="
    validate_megaservice
    echo "==== megaservice validated ===="
    validate_frontend
    echo "==== frontend validated ===="

    stop_docker
    echo y | docker system prune

Some files were not shown because too many files have changed in this diff.