Compare commits


71 Commits
v1.2 ... ft

Author SHA1 Message Date
chen, suyue
de96cd4dcf Merge branch 'main' into ft 2025-03-07 15:05:27 +08:00
chen, suyue
555e2405b9 Fix corner CI issue when the example path deleted (#1634)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-07 15:05:08 +08:00
Shifani Rajabose
7a92435269 [Bug: 112] Fix introduction in GenAIExamples main README (#1631) 2025-03-07 14:31:34 +08:00
Eero Tamminen
c9085c3c68 Use GenAIComp base image to simplify Dockerfiles (#1612)
Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
2025-03-07 13:13:29 +08:00
ZePan110
36aaed748b Update model cache for AgentQnA (#1627)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 11:00:48 +08:00
Letong Han
9180f1066d Enable vllm for CodeTrans (#1626)
Set vllm as default llm serving, and add related docker compose files, readmes, and test scripts.

Issue: https://github.com/opea-project/GenAIExamples/issues/1436

Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-03-07 10:56:21 +08:00
ZePan110
5aecea8e47 Update compose.yaml (#1619)
Update compose.yaml for CodeGen, CodeTrans and DocSum

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 09:20:28 +08:00
ZePan110
6723395e31 Update compose.yaml (#1620)
Update compose.yaml for AudioQnA, DBQnA, DocIndexRetriever, FaqGen, Translation and VisualQnA.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 09:20:08 +08:00
ZePan110
785ffb9a1e Update compose.yaml for ChatQnA (#1621)
Update compose.yaml for ChatQnA

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 09:19:39 +08:00
ZePan110
428ba481b2 Update compose.yaml for SearchQnA (#1622)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-07 08:38:59 +08:00
pre-commit-ci[bot]
769105b986 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-03-06 00:43:57 -05:00
Wang, Kai Lawrence
2dfcfa0436 [AudioQnA] Fix the LLM model field for inputs alignment (#1611)
Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
2025-03-05 22:15:07 +08:00
Zhu Yongbo
8a5ad1fc72 Fix docker image opea/edgecraftrag security issue #1577 (#1617)
Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com>
2025-03-05 22:13:53 +08:00
ZePan110
24cacaaa48 Enable SearchQnA model cache for docker compose test. (#1606)
Enable SearchQnA model cache for docker compose test.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-05 17:13:24 +08:00
ZePan110
6ead1b12db Enable ChatQnA model cache for docker compose test. (#1605)
Enable ChatQnA model cache for docker compose test.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-05 11:30:04 +08:00
rbrugaro
8dac9d1035 Bugfix GraphRAG: updated docker compose and env settings to fix issues post refactor (#1567)
Signed-off-by: rbrugaro <rita.brugarolas.brufau@intel.com>
Signed-off-by: Rita Brugarolas Brufau <rita.brugarolas.brufau@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
2025-03-04 09:44:13 -08:00
ZePan110
c1b5ba281f Enable CodeGen, CodeTrans and DocSum model cache for docker compose test. (#1599)
1. Add cache path check.
2. Enable CodeGen, CodeTrans and DocSum model cache for docker compose test.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-04 16:10:20 +08:00
Ye, Xinyu
641f60c76c Merged InstructionTuning and RerankFinetuning into Finetuning.
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
2025-03-04 01:13:31 -05:00
chen, suyue
8f8d3af7c3 open chatqna frontend test (#1594)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-03-04 10:41:22 +08:00
ZePan110
e4de76da78 Use model cache for docker compose test (#1582)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-03-04 09:48:27 +08:00
Spycsh
ce38a84372 Revert chatqna async and enhance tests (#1598)
align with opea-project/GenAIComps#1354
2025-03-03 23:03:44 +08:00
Ying Hu
e8b07c28ec Update DBQnA tgi docker image to latest tgi 2.4.0 (#1593) 2025-03-03 16:17:19 +08:00
chen, suyue
7b3a125bdf Fix cd workflow condition (#1588)
Fix cd workflow condition

Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
2025-03-03 08:45:10 +08:00
Eze Lanza (Eze)
fba0de45d2 ChatQnA Docker compose file for Milvus as vdb (#1548)
Signed-off-by: Ezequiel Lanza <ezequiel.lanza@gmail.com>
Signed-off-by: Kendall González León <kendall.gonzalez.leon@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: Spycsh <sihan.chen@intel.com>
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Signed-off-by: ZePan110 <ze.pan@intel.com>
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: minmin-intel <minmin.hou@intel.com>
Signed-off-by: Artem Astafev <a.astafev@datamonsters.com>
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
Signed-off-by: alexsin368 <alex.sin@intel.com>
Signed-off-by: WenjiaoYue <wenjiao.yue@intel.com>
Co-authored-by: Ezequiel Lanza <emlanza@CDQ242RKJDmac.local>
Co-authored-by: Kendall González León <kendallgonzalez@hotmail.es>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
Co-authored-by: Spycsh <39623753+Spycsh@users.noreply.github.com>
Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com>
Co-authored-by: jotpalch <49465120+jotpalch@users.noreply.github.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: minmin-intel <minmin.hou@intel.com>
Co-authored-by: Ying Hu <ying.hu@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eero Tamminen <eero.t.tamminen@intel.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
Co-authored-by: Artem Astafev <a.astafev@datamonsters.com>
Co-authored-by: XinyaoWa <xinyao.wang@intel.com>
Co-authored-by: alexsin368 <109180236+alexsin368@users.noreply.github.com>
Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
2025-02-28 22:40:31 +08:00
WenjiaoYue
f2a5644d9c fix click example button issue (#1586)
Signed-off-by: WenjiaoYue <wenjiao.yue@intel.com>
2025-02-28 16:10:58 +08:00
alexsin368
6cd7827365 Top level README: add link to github.io documentation (#1584)
Signed-off-by: alexsin368 <alex.sin@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-28 13:43:43 +08:00
chen, suyue
3d8009aa91 Fix benchmark scripts (#1517)
- Align benchmark default config:
  1. Update the default helm charts version.
  2. Add `# mandatory` comments.
  3. Update the default model ID for LLM.
- Fix deploy issues:
  1. Support different `replicaCount` values for tests with and without rerank.
  2. Add `max_num_seqs` for vllm.
  3. Add resource settings for tune mode.
- Fix benchmark issues:
  1. Update the `user_queries` and `concurrency` settings.
  2. Remove invalid parameters.
  3. Fix the `dataset` and `prompt` settings, and ingest the dataset into the db.
  4. Fix the benchmark hang with large user queries; setting `"processes": 16` resolves it.
  5. Update the eval_path setting logic.
- Optimize the benchmark readme.
- Optimize the log path to make the logs more readable.

Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
2025-02-28 10:30:54 +08:00
XinyaoWa
78f8ae524d Fix async in chatqna bug (#1589)
Align async with comps; related PR: opea-project/GenAIComps#1300

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
2025-02-27 23:32:29 +08:00
Artem Astafev
6abf7652e8 Fix ChatQnA ROCm compose Readme file and absolute path for ROCM CI test (#1159)
Signed-off-by: Artem Astafev <a.astafev@datamonsters.com>
2025-02-27 15:26:45 +08:00
Spycsh
25c1aefc27 Align mongo related image names with comps (#1543)
- chathistory-mongo-server -> chathistory-mongo (except container names)
- feedbackmanagement -> feedbackmanagement-mongo
- promptregistry-server/promptregistry-mongo-server -> promptregistry-mongo (except container names)

Signed-off-by: Spycsh <sihan.chen@intel.com>
2025-02-27 09:25:49 +08:00
dependabot[bot]
d46df4331d Bump gradio from 5.5.0 to 5.11.0 in /DocSum/ui/gradio (#1576)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Liang Lv <liang1.lv@intel.com>
2025-02-25 14:32:03 +08:00
Eero Tamminen
23a77df302 Fix "OpenAI" & "response" spelling (#1561) 2025-02-25 12:45:21 +08:00
Ying Hu
852bc7027c Update README.md of AIPC quick start (#1578)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-23 17:38:27 +08:00
minmin-intel
a7eced4161 Update AgentQnA and DocIndexRetriever (#1564)
Signed-off-by: minmin-intel <minmin.hou@intel.com>
2025-02-22 09:51:26 +08:00
ZePan110
caec354324 Fix trivy issue (#1569)
Fix docker image security issue

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-20 14:41:52 +08:00
xiguiw
d482554a6b Fix mismatched environment variable (#1575)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-19 19:24:10 +08:00
xiguiw
2ae6871fc5 Simplify ChatQnA AIPC user setting (#1573)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-19 16:30:02 +08:00
dependabot[bot]
2ac5be9921 Bump gradio from 5.5.0 to 5.11.0 in /MultimodalQnA/ui/gradio (#1391)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
2025-02-19 15:58:46 +08:00
ZePan110
799881a3fa Remove perf test code from test scripts. (#1510)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-18 16:23:49 +08:00
jotpalch
e5c6418c81 Fix minor typo in README (#1559)
Change "Docker Compost<br/>Deployment on ROCm" to "Docker Compose<br/>Deployment on ROCm".
2025-02-17 12:07:31 +08:00
xiguiw
0c0edffc5b update vLLM CPU to the latest stable version (#1546)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
2025-02-17 08:26:25 +08:00
Spycsh
9f36e84c1c Refactor AudioQnA README (#1508)
Signed-off-by: Spycsh <sihan.chen@intel.com>
2025-02-15 11:30:16 +08:00
chen, suyue
8c547c2ba5 Expand CI test scope for common test scripts (#1554)
Expand the CI test scope: trigger all hardware tests when the common test scripts change.

Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-02-14 18:17:03 +08:00
Kendall González León
80dd86f122 Make a fix in the main README.md of the ChatQnA. (#1551)
Signed-off-by: Kendall González León <kendall.gonzalez.leon@intel.com>
2025-02-14 17:00:44 +08:00
ZePan110
6d781f7b2b Fix CICD workflow strategy running condition (#1533)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-13 16:10:00 +08:00
WenjiaoYue
abafd5de20 Update UI of the three demos: faqGen, VisualQnA, and DocSum. (#1528)
Signed-off-by: WenjiaoYue <wenjiao.yue@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-12 15:57:51 +08:00
Louie Tsai
970b869838 Add a new section to change LLM model such as deepseek based on validated model table in LLM microservice (#1501)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Wang, Kai Lawrence <109344418+wangkl2@users.noreply.github.com>
Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com>
2025-02-12 09:34:56 +08:00
XinyaoWa
87ff149f61 Remove vllm hpu triton version fix (#1515)
vllm-fork has fixed the triton version issue, so remove the duplicated code: https://github.com/HabanaAI/vllm-fork/blob/habana_main/requirements-hpu.txt

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
2025-02-12 09:24:38 +08:00
chen, suyue
c39a569ab2 Update workflow condition and env (#1522)
Signed-off-by: chensuyue <suyue.chen@intel.com>
2025-02-12 09:08:22 +08:00
chen, suyue
81b02bb947 Revert "HUGGINGFACEHUB_API_TOKEN environment variable is changed to HF_TOKEN (#… (#1521)
Revert this PR since the test was not triggered properly due to the accidental merge of a WIP CI PR, 44a689b0bf, which blocked the CI test.

This change will be submitted in another PR.
2025-02-11 18:36:12 +08:00
Louie Tsai
47069ac70c fix a test script issue due to name change for telemetry yaml files (#1516)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-02-11 17:58:42 +08:00
chen, suyue
6ce7730863 Update CI/CD workflow (#1520)
1. Update auto commit account.
2. Fix test condition.

Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-11 17:48:37 +08:00
Louie Tsai
ad5523bac7 Enable OpenTelemetry Tracing for ChatQnA on Xeon and Gaudi via the docker compose merge feature (#1488)
Signed-off-by: Louie, Tsai <louie.tsai@intel.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-02-10 22:58:50 -08:00
Louie Tsai
88a8235f21 Update README.md for Agent UI (#1495)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-02-10 22:22:55 -08:00
ZePan110
63ad850052 Update docker image list (#1513)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-11 13:18:22 +08:00
ZePan110
9a0c547112 Fix publish issue (#1514)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-11 11:43:00 +08:00
ZePan110
26a6da4123 Fix nightly triggered exceptions (#1505)
Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-10 16:51:34 +08:00
xiguiw
45d5da2ddd HUGGINGFACEHUB_API_TOKEN environment variable is changed to HF_TOKEN (#1503)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-09 20:33:06 +08:00
xiguiw
1b3291a1c8 Fix docker compose.yaml error (#1496)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
2025-02-07 09:53:20 +08:00
ZePan110
7ac8cf517a Restore test code. (#1502)
Remove nightly test code.

Signed-off-by: ZePan110 <ze.pan@intel.com>
2025-02-07 09:50:21 +08:00
ZePan110
44a689b0bf Fix null value_file judgment (#1470)
Signed-off-by: ZePan110 <ze.pan@intel.com>
Co-authored-by: Malini Bhandaru <malini.bhandaru@intel.com>
2025-02-06 17:09:01 +08:00
xiguiw
388d3eb5c5 [Doc] Clean empty document (#1497)
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-06 10:53:25 +08:00
chyundunovDatamonsters
ef9ad61440 DBQnA - Adding files to deploy DBQnA application on AMD GPU (#1273)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
Co-authored-by: Chingis Yundunov <YundunovCN@sibedge.com>
Co-authored-by: Malini Bhandaru <malini.bhandaru@intel.com>
2025-02-06 09:41:59 +08:00
Louie Tsai
4c41a5db83 Update README.md for OPEA OTLP tracing (#1406)
Signed-off-by: louie-tsai <louie.tsai@intel.com>
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Eero Tamminen <eero.t.tamminen@intel.com>
2025-02-05 13:03:15 -08:00
Liang Lv
9adf7a6af0 Add support for latest deepseek models on Gaudi (#1491)
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
2025-02-05 08:30:04 +08:00
chen, suyue
a4d028e8ea update image release workflow (#1303)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: Malini Bhandaru <malini.bhandaru@intel.com>
2025-02-03 17:07:07 -08:00
Omar Khleif
32d4f714fd Fix for NLTK related import failure (#1487)
Signed-off-by: okhleif-IL <omar.khleif@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-01 10:04:37 +08:00
chyundunovDatamonsters
fdbc27a9b5 AvatarChatbot - Adding files to deploy AvatarChatbot application on AMD GPU (#1288)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
2025-01-27 11:30:52 +08:00
XinyuYe-Intel
5f4b1828a5 Added UT for rerank finetuning on Gaudi (#1472)
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
2025-01-27 11:24:05 +08:00
chyundunovDatamonsters
39abef8be8 SearchQnA App - Adding files to deploy SearchQnA application on AMD GPU (#1193)
Signed-off-by: Chingis Yundunov <YundunovCN@sibedge.com>
2025-01-27 10:58:55 +08:00
bjzhjing
ed163087ba Provide unified scalable deployment and benchmarking support for exam… (#1315)
Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-01-24 22:27:49 +08:00
195 changed files with 30971 additions and 1247 deletions

View File

@@ -43,7 +43,11 @@ on:
inject_commit:
default: false
required: false
type: string
type: boolean
use_model_cache:
default: false
required: false
type: boolean
jobs:
####################################################################################################
@@ -74,12 +78,16 @@ jobs:
cd ${{ github.workspace }}/${{ inputs.example }}/docker_image_build
docker_compose_path=${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml
if [[ $(grep -c "vllm:" ${docker_compose_path}) != 0 ]]; then
git clone --depth 1 https://github.com/vllm-project/vllm.git
cd vllm && git rev-parse HEAD && cd ../
git clone https://github.com/vllm-project/vllm.git && cd vllm
# Get the latest tag
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
# make sure do not change the pwd
git rev-parse HEAD && cd ../
fi
if [[ $(grep -c "vllm-gaudi:" ${docker_compose_path}) != 0 ]]; then
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
fi
git clone --depth 1 --branch ${{ inputs.opea_branch }} https://github.com/opea-project/GenAIComps.git
cd GenAIComps && git rev-parse HEAD && cd ../
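
The key change here is pinning the vLLM build to its most recent release tag instead of an arbitrary HEAD: `git rev-list --tags --max-count=1` finds the newest tagged commit and `git describe --tags` turns it into a tag name. A quick way to sanity-check which tag the CI would pick, assuming a local vllm clone already exists:

```bash
# From inside an existing vllm checkout: show which tag the CI build step would select.
cd vllm
latest_tagged_commit="$(git rev-list --tags --max-count=1)"  # newest commit that carries a tag
VLLM_VER="$(git describe --tags "${latest_tagged_commit}")"  # tag name for that commit
echo "CI would build vLLM at tag: ${VLLM_VER}"
```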
@@ -106,6 +114,7 @@ jobs:
tag: ${{ inputs.tag }}
example: ${{ inputs.example }}
hardware: ${{ inputs.node }}
use_model_cache: ${{ inputs.use_model_cache }}
secrets: inherit
@@ -127,7 +136,7 @@ jobs:
####################################################################################################
test-gmc-pipeline:
needs: [build-images]
if: ${{ fromJSON(inputs.test_gmc) }}
if: false # ${{ fromJSON(inputs.test_gmc) }}
uses: ./.github/workflows/_gmc-e2e.yml
with:
example: ${{ inputs.example }}

View File

@@ -97,6 +97,7 @@ jobs:
helm-test:
needs: [get-test-case]
if: ${{ needs.get-test-case.outputs.value_files != '[]' }}
strategy:
matrix:
value_file: ${{ fromJSON(needs.get-test-case.outputs.value_files) }}

View File

@@ -28,6 +28,10 @@ on:
required: false
type: string
default: ""
use_model_cache:
required: false
type: boolean
default: false
jobs:
get-test-case:
runs-on: ubuntu-latest
@@ -85,12 +89,17 @@ jobs:
fi
done
if [ -z "$run_test_cases" ] && [[ $(printf '%s\n' "${changed_files[@]}" | grep ${{ inputs.example }} | grep /tests/) ]]; then
run_test_cases=$other_test_cases
fi
test_cases=$(echo $run_test_cases | tr ' ' '\n' | sort -u | jq -R '.' | jq -sc '.')
echo "test_cases=$test_cases"
echo "test_cases=$test_cases" >> $GITHUB_OUTPUT
compose-test:
needs: [get-test-case]
if: ${{ needs.get-test-case.outputs.test_cases != '' }}
strategy:
matrix:
test_case: ${{ fromJSON(needs.get-test-case.outputs.test_cases) }}
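
The `test_cases` output above is produced by a small jq pipeline that turns a space-separated, possibly duplicated list of scripts into a compact JSON array for the job matrix. A quick illustration of that transformation, with hypothetical script names:

```bash
# Deduplicate a space-separated list and emit it as a compact JSON array,
# mirroring the pipeline used for the test_cases output.
run_test_cases="test_compose_on_xeon.sh test_compose_on_gaudi.sh test_compose_on_xeon.sh"
echo "$run_test_cases" | tr ' ' '\n' | sort -u | jq -R '.' | jq -sc '.'
# -> ["test_compose_on_gaudi.sh","test_compose_on_xeon.sh"]
```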
@@ -126,6 +135,7 @@ jobs:
shell: bash
env:
HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
HF_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
PINECONE_KEY: ${{ secrets.PINECONE_KEY }}
@@ -138,9 +148,18 @@ jobs:
example: ${{ inputs.example }}
hardware: ${{ inputs.hardware }}
test_case: ${{ matrix.test_case }}
use_model_cache: ${{ inputs.use_model_cache }}
run: |
cd ${{ github.workspace }}/$example/tests
if [[ "$IMAGE_REPO" == "" ]]; then export IMAGE_REPO="${OPEA_IMAGE_REPO}opea"; fi
if [[ "$use_model_cache" == "true" ]]; then
if [ -d "/data2/hf_model" ]; then
export model_cache="/data2/hf_model"
else
echo "Model cache directory /data2/hf_model does not exist"
export model_cache="~/.cache/huggingface/hub"
fi
fi
if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi
- name: Clean up container after test

View File

@@ -41,9 +41,11 @@ jobs:
publish:
needs: [get-image-list]
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
fail-fast: false
runs-on: "docker-build-${{ inputs.node }}"
steps:
- uses: docker/login-action@v3.2.0

View File

@@ -47,6 +47,7 @@ jobs:
scan-docker:
needs: get-image-list
runs-on: "docker-build-${{ inputs.node }}"
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
image: ${{ fromJson(needs.get-image-list.outputs.matrix) }}

View File

@@ -20,11 +20,11 @@ on:
description: "Tag to apply to images"
required: true
type: string
deploy_gmc:
default: false
description: 'Whether to deploy gmc'
required: true
type: boolean
# deploy_gmc:
# default: false
# description: 'Whether to deploy gmc'
# required: true
# type: boolean
build:
default: true
description: 'Build test required images for Examples'
@@ -40,11 +40,11 @@ on:
description: 'Test examples with helm charts'
required: false
type: boolean
test_gmc:
default: false
description: 'Test examples with gmc'
required: false
type: boolean
# test_gmc:
# default: false
# description: 'Test examples with gmc'
# required: false
# type: boolean
opea_branch:
default: "main"
description: 'OPEA branch for image build'
@@ -54,7 +54,12 @@ on:
default: false
description: "inject commit to docker images true or false"
required: false
type: string
type: boolean
use_model_cache:
default: false
description: "use model cache true or false"
required: false
type: boolean
permissions: read-all
jobs:
@@ -76,7 +81,8 @@ jobs:
build-deploy-gmc:
needs: [get-test-matrix]
if: ${{ fromJSON(inputs.deploy_gmc) }}
if: false
#${{ fromJSON(inputs.deploy_gmc) }}
strategy:
matrix:
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}
@@ -89,7 +95,7 @@ jobs:
secrets: inherit
run-examples:
needs: [get-test-matrix, build-deploy-gmc]
needs: [get-test-matrix] #[get-test-matrix, build-deploy-gmc]
if: always()
strategy:
matrix:
@@ -104,7 +110,8 @@ jobs:
build: ${{ fromJSON(inputs.build) }}
test_compose: ${{ fromJSON(inputs.test_compose) }}
test_helmchart: ${{ fromJSON(inputs.test_helmchart) }}
test_gmc: ${{ fromJSON(inputs.test_gmc) }}
# test_gmc: ${{ fromJSON(inputs.test_gmc) }}
opea_branch: ${{ inputs.opea_branch }}
inject_commit: ${{ inputs.inject_commit }}
use_model_cache: ${{ inputs.use_model_cache }}
secrets: inherit

View File

@@ -25,9 +25,9 @@ jobs:
- name: Set up Git
run: |
git config --global user.name "NeuralChatBot"
git config --global user.email "grp_neural_chat_bot@intel.com"
git remote set-url origin https://NeuralChatBot:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIExamples.git
git config --global user.name "CICD-at-OPEA"
git config --global user.email "CICD@opea.dev"
git remote set-url origin https://CICD-at-OPEA:"${{ secrets.ACTION_TOKEN }}"@github.com/opea-project/GenAIExamples.git
- name: Run script
run: |

View File

@@ -51,6 +51,7 @@ jobs:
image-build:
needs: get-test-matrix
if: ${{ needs.get-test-matrix.outputs.nodes != '' }}
strategy:
matrix:
node: ${{ fromJson(needs.get-test-matrix.outputs.nodes) }}

View File

@@ -33,6 +33,7 @@ jobs:
clean-up:
needs: get-build-matrix
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
node: ${{ fromJson(needs.get-build-matrix.outputs.nodes) }}
@@ -47,6 +48,7 @@ jobs:
build:
needs: [get-build-matrix, clean-up]
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
example: ${{ fromJson(needs.get-build-matrix.outputs.examples) }}

View File

@@ -34,6 +34,7 @@ jobs:
build-and-test:
needs: get-build-matrix
if: ${{ needs.get-build-matrix.outputs.examples_json != '' }}
strategy:
matrix:
example: ${{ fromJSON(needs.get-build-matrix.outputs.examples_json) }}
@@ -53,9 +54,11 @@ jobs:
publish:
needs: [get-build-matrix, get-image-list, build-and-test]
if: ${{ needs.get-image-list.outputs.matrix != '' }}
strategy:
matrix:
image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }}
fail-fast: false
runs-on: "docker-build-gaudi"
steps:
- uses: docker/login-action@v3.2.0

View File

@@ -65,7 +65,7 @@ jobs:
helm-chart-test:
needs: [job1]
if: always() && ${{ needs.job1.outputs.run_matrix.example.length > 0 }}
if: always() && ${{ fromJSON(needs.job1.outputs.run_matrix).length != 0 }}
uses: ./.github/workflows/_helm-e2e.yml
strategy:
matrix: ${{ fromJSON(needs.job1.outputs.run_matrix) }}

View File

@@ -32,15 +32,16 @@ jobs:
example-test:
needs: [get-test-matrix]
if: ${{ needs.get-test-matrix.outputs.run_matrix != '' }}
strategy:
matrix: ${{ fromJSON(needs.get-test-matrix.outputs.run_matrix) }}
fail-fast: false
if: ${{ !github.event.pull_request.draft }}
uses: ./.github/workflows/_run-docker-compose.yml
with:
registry: "opea"
tag: "ci"
example: ${{ matrix.example }}
hardware: ${{ matrix.hardware }}
use_model_cache: true
diff_excluded_files: '\.github|\.md|\.txt|kubernetes|gmc|assets|benchmark'
secrets: inherit

View File

@@ -24,6 +24,7 @@ jobs:
image-build:
needs: job1
if: ${{ needs.job1.outputs.run_matrix != '{"include":[]}' }}
strategy:
matrix: ${{ fromJSON(needs.job1.outputs.run_matrix) }}
fail-fast: false

View File

@@ -12,6 +12,7 @@ run_matrix="{\"include\":["
examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u)
for example in ${examples}; do
if [[ ! -d $WORKSPACE/$example ]]; then continue; fi
cd $WORKSPACE/$example
if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi
cd tests
@@ -26,7 +27,10 @@ for example in ${examples}; do
run_hardware=""
if [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | cut -d'/' -f2 | grep -E '\.py|Dockerfile*|ui|docker_image_build' ) ]]; then
# run test on all hardware if megaservice or ui code change
echo "run test on all hardware if megaservice or ui code change..."
run_hardware=$hardware_list
elif [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | grep 'tests'| cut -d'/' -f3 | grep -vE '^test_|^_test' ) ]]; then
echo "run test on all hardware if common test scripts change..."
run_hardware=$hardware_list
else
for hardware in ${hardware_list}; do

View File

@@ -16,8 +16,8 @@ jobs:
freeze-images:
runs-on: ubuntu-latest
env:
USER_NAME: "NeuralChatBot"
USER_EMAIL: "grp_neural_chat_bot@intel.com"
USER_NAME: "CICD-at-OPEA"
USER_EMAIL: "CICD@opea.dev"
BRANCH_NAME: "update_images_tag"
steps:
- name: Checkout repository

View File

@@ -84,7 +84,7 @@ flowchart LR
3. Hierarchical multi-agents can improve performance.
Expert worker agents, such as the RAG agent and SQL agent, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information to provide a comprehensive answer. If we only use one agent and give all the tools to this single agent, it may get overwhelmed and fail to provide accurate answers.
## Deployment with docker
## Deploy with docker
1. Build agent docker image [Optional]
@@ -217,13 +217,19 @@ docker build -t opea/agent:latest --build-arg https_proxy=$https_proxy --build-a
:::
::::
## Deploy AgentQnA UI
The AgentQnA UI can be deployed locally or using Docker.
For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
## Deploy using Helm Chart
Refer to the [AgentQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AgentQnA on Kubernetes.
## Validate services
First look at logs of the agent docker containers:
1. First look at logs of the agent docker containers:
```
# worker RAG agent
@@ -240,35 +246,18 @@ docker logs react-agent-endpoint
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
Second, validate worker RAG agent:
2. You can use python to validate the agent system
```bash
# RAG worker agent
python tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
# SQL agent
python tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
# supervisor agent: this will test a two-turn conversation
python tests/test.py --agent_role "supervisor" --ext_port 9090
```
curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "Michael Jackson song Thriller"
}'
```
Third, validate worker SQL agent:
```
curl http://${host_ip}:9096/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many employees are in the company"
}'
```
Finally, validate supervisor agent:
```
curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
"messages": "How many albums does Iron Maiden have?"
}'
```
## Deploy AgentQnA UI
The AgentQnA UI can be deployed locally or using Docker.
For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
## How to register your own tools with agent

View File

@@ -8,7 +8,7 @@ services:
ports:
- "${AGENTQNA_TGI_SERVICE_PORT-8085}:80"
volumes:
- /var/opea/agent-service/:/data
- ${HF_CACHE_DIR:-/var/opea/agent-service/}:/data
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -60,7 +60,7 @@ This example showcases a hierarchical multi-agent system for question-answering
```
6. Launch multi-agent system
The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use openAI GPT-4o-mini as LLM.
The configurations of the supervisor agent and the worker agents are defined in the docker-compose yaml file. We currently use OpenAI GPT-4o-mini as LLM.
```
cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon

View File

@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent
with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -35,17 +36,17 @@ services:
image: opea/agent:latest
container_name: sql-agent-endpoint
volumes:
- ${WORKDIR}/TAG-Bench/:/home/user/TAG-Bench # SQL database
- ${WORKDIR}/GenAIExamples/AgentQnA/tests:/home/user/chinook-db # SQL database
ports:
- "9096:9096"
ipc: host
environment:
ip_address: ${ip_address}
strategy: sql_agent
with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
hints_file: /home/user/TAG-Bench/${db_name}_hints.csv
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -64,6 +65,7 @@ services:
container_name: react-agent-endpoint
depends_on:
- worker-rag-agent
- worker-sql-agent
volumes:
- ${TOOLSET_PATH}:/home/user/tools/
ports:
@@ -71,14 +73,15 @@ services:
ipc: host
environment:
ip_address: ${ip_address}
strategy: react_langgraph
strategy: react_llama
with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
stream: false
stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}

View File

@@ -16,7 +16,7 @@ export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
export db_name=california_schools
export db_path="sqlite:////home/user/TAG-Bench/dev_folder/dev_databases/${db_name}/${db_name}.sqlite"
export db_name=Chinook
export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
docker compose -f compose_openai.yaml up -d

View File

@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent_llama
with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -43,6 +44,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: sql_agent_llama
with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
@@ -74,6 +76,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: react_llama
with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -81,7 +84,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
stream: false
stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}

View File

@@ -14,7 +14,7 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export NUM_SHARDS=4
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
export temperature=0

View File

@@ -43,7 +43,6 @@ function build_vllm_docker_image() {
fi
cd ./vllm-fork
git checkout v0.6.4.post2+Gaudi-1.19.0
sed -i 's/triton/triton==3.1.0/g' requirements-hpu.txt
docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:ci --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
if [ $? -ne 0 ]; then
echo "opea/vllm-gaudi:ci failed"

View File

@@ -9,7 +9,7 @@ echo "WORKDIR=${WORKDIR}"
export ip_address=$(hostname -I | awk '{print $1}')
export host_ip=${ip_address}
export HF_CACHE_DIR=$WORKDIR/hf_cache
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
if [ ! -d "$HF_CACHE_DIR" ]; then
echo "Creating HF_CACHE directory"
mkdir -p "$HF_CACHE_DIR"

View File

@@ -11,9 +11,9 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Meta-Llama-3.1-70B-Instruct"
model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export HF_CACHE_DIR=/data2/huggingface
export HF_CACHE_DIR=${model_cache:-"/data2/huggingface"}
if [ ! -d "$HF_CACHE_DIR" ]; then
HF_CACHE_DIR=$WORKDIR/hf_cache
mkdir -p "$HF_CACHE_DIR"
@@ -60,23 +60,6 @@ function start_vllm_service_70B() {
echo "Service started successfully"
}
function prepare_data() {
cd $WORKDIR
echo "Downloading data..."
git clone https://github.com/TAG-Research/TAG-Bench.git
cd TAG-Bench/setup
chmod +x get_dbs.sh
./get_dbs.sh
echo "Split data..."
cd $WORKPATH/tests/sql_agent_test
bash run_data_split.sh
echo "Data preparation done!"
}
function download_chinook_data(){
echo "Downloading chinook data..."
cd $WORKDIR
@@ -113,7 +96,7 @@ function validate_agent_service() {
echo "======================Testing worker rag agent======================"
export agent_port="9095"
prompt="Tell me about Michael Jackson song Thriller"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
# echo $CONTENT
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
echo $EXIT_CODE
@@ -127,7 +110,7 @@ function validate_agent_service() {
echo "======================Testing worker sql agent======================"
export agent_port="9096"
prompt="How many employees are there in the company?"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
echo $CONTENT
# echo $EXIT_CODE
@@ -140,9 +123,8 @@ function validate_agent_service() {
# test supervisor react agent
echo "======================Testing supervisor react agent======================"
export agent_port="9090"
prompt="How many albums does Iron Maiden have?"
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
local EXIT_CODE=$(validate "$CONTENT" "21" "react-agent-endpoint")
local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
# echo $CONTENT
echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
@@ -153,15 +135,6 @@ function validate_agent_service() {
}
function remove_data() {
echo "Removing data..."
cd $WORKDIR
if [ -d "TAG-Bench" ]; then
rm -rf TAG-Bench
fi
echo "Data removed!"
}
function remove_chinook_data(){
echo "Removing chinook data..."
cd $WORKDIR
@@ -189,8 +162,9 @@ function main() {
echo "==================== Agent service validated ===================="
}
remove_data
remove_chinook_data
main
remove_data
remove_chinook_data

View File

@@ -11,7 +11,7 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR=$WORKDIR/hf_cache
export HF_CACHE_DIR=${model_cache:-"$WORKDIR/hf_cache"}
if [ ! -d "$HF_CACHE_DIR" ]; then
mkdir -p "$HF_CACHE_DIR"
fi

View File

@@ -1,34 +1,20 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
import os
import json
import uuid
import requests
def generate_answer_agent_api(url, prompt):
proxies = {"http": ""}
payload = {
"messages": prompt,
}
response = requests.post(url, json=payload, proxies=proxies)
answer = response.json()["text"]
return answer
def process_request(url, query, is_stream=False):
proxies = {"http": ""}
payload = {
"messages": query,
}
content = json.dumps(query) if query is not None else None
try:
resp = requests.post(url=url, json=payload, proxies=proxies, stream=is_stream)
resp = requests.post(url=url, data=content, proxies=proxies, stream=is_stream)
if not is_stream:
ret = resp.json()["text"]
print(ret)
else:
for line in resp.iter_lines(decode_unicode=True):
print(line)
@@ -38,19 +24,54 @@ def process_request(url, query, is_stream=False):
return ret
except requests.exceptions.RequestException as e:
ret = f"An error occurred:{e}"
print(ret)
return False
return None
def test_worker_agent(args):
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
query = {"role": "user", "messages": args.prompt, "stream": "false"}
ret = process_request(url, query)
print("Response: ", ret)
def add_message_and_run(url, user_message, thread_id, stream=False):
print("User message: ", user_message)
query = {"role": "user", "messages": user_message, "thread_id": thread_id, "stream": stream}
ret = process_request(url, query, is_stream=stream)
print("Response: ", ret)
def test_chat_completion_multi_turn(args):
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
thread_id = f"{uuid.uuid4()}"
# first turn
print("===============First turn==================")
user_message = "Which artist has the most albums in the database?"
add_message_and_run(url, user_message, thread_id, stream=args.stream)
print("===============End of first turn==================")
# second turn
print("===============Second turn==================")
user_message = "Give me a few examples of the artist's albums?"
add_message_and_run(url, user_message, thread_id, stream=args.stream)
print("===============End of second turn==================")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--prompt", type=str)
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()
parser.add_argument("--ip_addr", type=str, default="127.0.0.1", help="endpoint ip address")
parser.add_argument("--ext_port", type=str, default="9090", help="endpoint port")
parser.add_argument("--stream", action="store_true", help="streaming mode")
parser.add_argument("--prompt", type=str, help="prompt message")
parser.add_argument("--agent_role", type=str, default="supervisor", help="supervisor or worker")
args, _ = parser.parse_known_args()
ip_address = os.getenv("ip_address", "localhost")
agent_port = os.getenv("agent_port", "9090")
url = f"http://{ip_address}:{agent_port}/v1/chat/completions"
prompt = args.prompt
print(args)
process_request(url, prompt, args.stream)
if args.agent_role == "supervisor":
test_chat_completion_multi_turn(args)
elif args.agent_role == "worker":
test_worker_agent(args)
else:
raise ValueError("Invalid agent role")
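
As a usage note, the reworked script dispatches on `--agent_role`: `supervisor` runs the two-turn, thread-id based conversation test, `worker` sends a single prompt, and anything else raises an error. The invocations used elsewhere in this change (ports as set in the compose files) look like:

```bash
# Worker RAG agent (exposed on port 9095 in the compose files)
python3 tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
# Worker SQL agent (port 9096)
python3 tests/test.py --prompt "How many employees are in the company?" --agent_role "worker" --ext_port 9096
# Supervisor agent: runs a two-turn conversation, optionally streaming
python3 tests/test.py --agent_role "supervisor" --ext_port 9090 --stream
```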

View File

@@ -78,7 +78,7 @@ bash step3_ingest_data_and_validate_retrieval.sh
echo "=================== #3 Data ingestion and validation completed===================="
echo "=================== #4 Start agent and API server===================="
bash step4_launch_and_validate_agent_tgi.sh
bash step4_launch_and_validate_agent_gaudi.sh
echo "=================== #4 Agent test passed ===================="
echo "=================== #5 Stop agent and API server===================="

View File

@@ -18,7 +18,7 @@ Here're some of the project's features:
2. cd command to the current folder.
```
cd AgentQnA/ui
cd AgentQnA/ui/svelte
```
3. Modify the required .env variables.
@@ -41,7 +41,7 @@ Here're some of the project's features:
npm run dev
```
- The application will be available at `http://localhost:3000`.
- The application will be available at `http://localhost:5173`.
5. **For Docker Setup:**
@@ -54,7 +54,7 @@ Here're some of the project's features:
- Run the Docker container:
```
docker run -d -p 3000:3000 --name agent-ui opea:agent-ui
docker run -d -p 5173:5173 --name agent-ui opea:agent-ui
```
- The application will be available at `http://localhost:3000`.
- The application will be available at `http://localhost:5173`.

View File

@@ -108,7 +108,7 @@
<!-- svelte-ignore a11y-click-events-have-key-events -->
<div
class="relative rounded-xl bg-white p-2 py-8 pl-16"
on:click={() => handleCreate(feature)}
on:click={() => handleCreate(feature.description)}
>
<dt class="text-base font-semibold text-gray-900">
<div

View File

@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
COPY ./audioqna.py $HOME/audioqna.py
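
With the multi-stage setup collapsed into a single `FROM opea/comps-base:$BASE_TAG` stage, the per-example image build reduces to one `docker build` on top of the shared base. A hedged sketch of how it might be invoked; the `BASE_TAG` value and proxy args are assumptions mirroring other build commands in this change:

```bash
# Build the AudioQnA megaservice image on top of the shared comps-base image.
# BASE_TAG defaults to "latest" in the Dockerfile; override it here only if needed.
cd GenAIExamples/AudioQnA
docker build --build-arg BASE_TAG=latest \
  --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \
  -t opea/audioqna:latest -f Dockerfile .
```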

View File

@@ -1,48 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Stage 1: base setup used by other stages
FROM python:3.11-slim AS base
# get security updates
RUN apt-get update && apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ENV HOME=/home/user
RUN useradd -m -s /bin/bash user && \
mkdir -p $HOME && \
chown -R user $HOME
WORKDIR $HOME
# Stage 2: latest GenAIComps sources
FROM base AS git
RUN apt-get update && apt-get install -y --no-install-recommends git
RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
# Stage 3: common layer shared by services using GenAIComps
FROM base AS comps-base
# copy just relevant parts
COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME
ENV PYTHONPATH=$PYTHONPATH:$HOME/GenAIComps
USER user
# Stage 4: unique part
FROM comps-base
ARG BASE_TAG=latest
FROM opea/comps-base:$BASE_TAG
COPY ./audioqna_multilang.py $HOME/audioqna_multilang.py

View File

@@ -16,13 +16,14 @@ SPEECHT5_SERVER_HOST_IP = os.getenv("SPEECHT5_SERVER_HOST_IP", "0.0.0.0")
SPEECHT5_SERVER_PORT = int(os.getenv("SPEECHT5_SERVER_PORT", 7055))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 3006))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
next_inputs["model"] = LLM_MODEL_ID
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]

View File

@@ -17,6 +17,7 @@ GPT_SOVITS_SERVER_HOST_IP = os.getenv("GPT_SOVITS_SERVER_HOST_IP", "0.0.0.0")
GPT_SOVITS_SERVER_PORT = int(os.getenv("GPT_SOVITS_SERVER_PORT", 9088))
LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 8888))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -24,7 +25,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
if self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified
next_inputs["model"] = LLM_MODEL_ID
next_inputs["messages"] = [{"role": "user", "content": inputs["asr_result"]}]
next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
next_inputs["top_p"] = llm_parameters_dict["top_p"]

View File

@@ -69,6 +69,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -25,6 +25,9 @@ Intel Xeon optimized image hosted in huggingface repo will be used for TGI servi
```bash
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
# multilang tts (optional)
docker build -t opea/gpt-sovits:latest --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -f comps/tts/src/integrations/dependency/gpt-sovits/Dockerfile .
```
### 5. Build MegaService Docker Image
@@ -42,6 +45,7 @@ Then run the command `docker images`, you will have following images ready:
1. `opea/whisper:latest`
2. `opea/speecht5:latest`
3. `opea/audioqna:latest`
4. `opea/gpt-sovits:latest` (optional)
## 🚀 Set the environment variables
@@ -57,9 +61,11 @@ export MEGA_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export LLM_SERVER_HOST_IP=${host_ip}
export GPT_SOVITS_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_PORT=7055
export GPT_SOVITS_SERVER_PORT=9880
export LLM_SERVER_PORT=3006
export BACKEND_SERVICE_ENDPOINT=http://${host_ip}:3008/v1/audioqna
@@ -74,16 +80,20 @@ Note: Please replace with host_ip with your external IP address, do not use loca
```bash
cd GenAIExamples/AudioQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
# multilang tts (optional)
docker compose -f compose_multilang.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
curl http://${host_ip}:7066/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@./sample.wav" \
-F model="openai/whisper-small"
# tgi service
curl http://${host_ip}:3006/generate \
@@ -92,11 +102,10 @@ curl http://${host_ip}:3006/generate \
-H 'Content-Type: application/json'
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
curl http://${host_ip}:7055/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
# gpt-sovits service (optional)
curl http://${host_ip}:9880/v1/audio/speech -XPOST -d '{"input": "Who are you?"}' -H 'Content-Type: application/json' --output speech.mp3
```
## 🚀 Test MegaService
@@ -106,7 +115,8 @@ base64 string to the megaservice endpoint. The megaservice will return a spoken
to the response, decode the base64 string and save it as a .wav file.
```bash
# voice can be "default" or "male"
# if you are using speecht5 as the tts service, voice can be "default" or "male"
# if you are using gpt-sovits for the tts service, you can set the reference audio following https://github.com/opea-project/GenAIComps/blob/main/comps/tts/src/integrations/dependency/gpt-sovits/README.md
curl http://${host_ip}:3008/v1/audioqna \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "max_tokens":64, "voice":"default"}' \

View File

@@ -30,7 +30,7 @@ services:
ports:
- "3006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -61,6 +61,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -31,7 +31,7 @@ services:
ports:
- "3006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -58,7 +58,20 @@ services:
- GPT_SOVITS_SERVER_PORT=${GPT_SOVITS_SERVER_PORT}
ipc: host
restart: always
audioqna-xeon-ui-server:
image: ${REGISTRY:-opea}/audioqna-ui:${TAG:-latest}
container_name: audioqna-xeon-ui-server
depends_on:
- audioqna-xeon-backend-server
ports:
- "5175:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- CHAT_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -40,7 +40,7 @@ services:
ports:
- "3006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -82,6 +82,7 @@ services:
- WHISPER_SERVER_PORT=${WHISPER_SERVER_PORT}
- LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP}
- LLM_SERVER_PORT=${LLM_SERVER_PORT}
- LLM_MODEL_ID=${LLM_MODEL_ID}
- SPEECHT5_SERVER_HOST_IP=${SPEECHT5_SERVER_HOST_IP}
- SPEECHT5_SERVER_PORT=${SPEECHT5_SERVER_PORT}
ipc: host

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -32,7 +32,7 @@ COPY --from=git $HOME/GenAIComps/comps $HOME/GenAIComps/comps
COPY --from=git $HOME/GenAIComps/*.* $HOME/GenAIComps/LICENSE $HOME/GenAIComps/
WORKDIR $HOME/GenAIComps
RUN pip install --no-cache-dir --upgrade pip && \
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r $HOME/GenAIComps/requirements.txt
WORKDIR $HOME

View File

@@ -0,0 +1,209 @@
# Build Mega Service of AvatarChatbot on AMD GPU
This document outlines the deployment process for an AvatarChatbot application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server.
## 🚀 Build Docker images
### 1. Install GenAIComps from Source Code
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
```
### 2. Build ASR Image
```bash
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/Dockerfile .
```
### 3. Build LLM Image
```bash
docker build --no-cache -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
```
### 4. Build TTS Image
```bash
docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/Dockerfile .
```
### 5. Build Animation Image
```bash
docker build -t opea/wav2lip:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/third_parties/wav2lip/src/Dockerfile .
docker build -t opea/animation:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/animation/src/Dockerfile .
```
### 6. Build MegaService Docker Image
To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `audioqna.py` Python script. Build the MegaService Docker image using the command below:
```bash
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/AvatarChatbot/
docker build --no-cache -t opea/avatarchatbot:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
```
Then run the command `docker images`; you will have the following images ready:
1. `opea/whisper:latest`
2. `opea/asr:latest`
3. `opea/llm-tgi:latest`
4. `opea/speecht5:latest`
5. `opea/tts:latest`
6. `opea/wav2lip:latest`
7. `opea/animation:latest`
8. `opea/avatarchatbot:latest`
## 🚀 Set the environment variables
Before starting the services with `docker compose`, you have to recheck the following environment variables.
```bash
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export host_ip=$(hostname -I | awk '{print $1}')
export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}
export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008
export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None'
export AUDIO='None'
export FACESIZE=96
export OUTFILE="/outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=10
```
Warning!!! The Wav2lip service in this solution runs only on the CPU. To use AMD GPUs and achieve operational performance, the Wav2lip image needs to be adapted to AMD hardware and the ROCm framework.
## 🚀 Start the MegaService
```bash
cd GenAIExamples/AvatarChatbot/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml up -d
```
## 🚀 Test MicroServices
```bash
# whisper service
curl http://${host_ip}:7066/v1/asr \
-X POST \
-d '{"audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
# asr microservice
curl http://${host_ip}:3001/v1/audio/transcriptions \
-X POST \
-d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
-H 'Content-Type: application/json'
# tgi service
curl http://${host_ip}:3006/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
# llm microservice
curl http://${host_ip}:3007/v1/chat/completions \
-X POST \
-d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
-H 'Content-Type: application/json'
# speecht5 service
curl http://${host_ip}:7055/v1/tts \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
# tts microservice
curl http://${host_ip}:3002/v1/audio/speech \
-X POST \
-d '{"text": "Who are you?"}' \
-H 'Content-Type: application/json'
# wav2lip service
cd ../../../..
curl http://${host_ip}:7860/v1/wav2lip \
-X POST \
-d @assets/audio/sample_minecraft.json \
-H 'Content-Type: application/json'
# animation microservice
curl http://${host_ip}:3008/v1/animation \
-X POST \
-d @assets/audio/sample_question.json \
-H "Content-Type: application/json"
```
## 🚀 Test MegaService
```bash
curl http://${host_ip}:3009/v1/avatarchatbot \
-X POST \
-d @assets/audio/sample_whoareyou.json \
-H 'Content-Type: application/json'
```
If the megaservice is running properly, you should see the following output:
```bash
"/outputs/result.mp4"
```
The output file will be saved in the current working directory, as `${PWD}` is mapped to `/outputs` inside the wav2lip-service Docker container.
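To confirm the clip was actually produced, check for the file on the host; inspecting it with `ffprobe` is optional and assumes the ffmpeg tools are installed:
```bash
# The container writes /outputs/result.mp4, which maps to ./result.mp4 on the host
ls -lh ./result.mp4
# Optional: show duration and codec details if ffmpeg tools are available
ffprobe -hide_banner ./result.mp4
```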
## Gradio UI
```bash
cd $WORKPATH/GenAIExamples/AvatarChatbot
python3 ui/gradio/app_gradio_demo_avatarchatbot.py
```
The UI can be viewed at http://${host_ip}:7861
<img src="../../../../assets/img/UI.png" alt="UI Example" width="60%">
In the current version (v1.0), you need to set the avatar figure image/video and the DL model choice via environment variables before starting the AvatarChatbot backend service and running the UI. Only the audio question should be customized in the UI.
\*\* Changing the avatar figure between runs will be supported in v2.0.
## Troubleshooting
```bash
cd GenAIExamples/AvatarChatbot/tests
export IMAGE_REPO="opea"
export IMAGE_TAG="latest"
export HUGGINGFACEHUB_API_TOKEN=<your_hf_token>
test_avatarchatbot_on_xeon.sh
```
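If the test fails, per-service logs usually point to the culprit. The loop below is a minimal sketch that dumps the same containers the test script inspects on failure:
```bash
# Collect logs from every container in the AvatarChatbot pipeline
for c in whisper-service asr-service speecht5-service tts-service \
         tgi-service llm-tgi-server wav2lip-service animation-server \
         avatarchatbot-backend-server; do
  docker logs "$c" > "${c}.log" 2>&1
done
ls -lh ./*.log
```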

View File

@@ -0,0 +1,158 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
whisper-service:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
asr:
image: ${REGISTRY:-opea}/asr:${TAG:-latest}
container_name: asr-service
ports:
- "3001:9099"
ipc: host
environment:
ASR_ENDPOINT: ${ASR_ENDPOINT}
speecht5-service:
image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
container_name: speecht5-service
ports:
- "7055:7055"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
restart: unless-stopped
tts:
image: ${REGISTRY:-opea}/tts:${TAG:-latest}
container_name: tts-service
ports:
- "3002:9088"
ipc: host
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
container_name: tgi-service
ports:
- "${TGI_SERVICE_PORT:-3006}:80"
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd
- /dev/dri/:/dev/dri/
cap_add:
- SYS_PTRACE
group_add:
- video
security_opt:
- seccomp:unconfined
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: llm-tgi-server
depends_on:
- tgi-service
ports:
- "3007:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
OPENAI_API_KEY: ${OPENAI_API_KEY}
restart: unless-stopped
wav2lip-service:
image: ${REGISTRY:-opea}/wav2lip:${TAG:-latest}
container_name: wav2lip-service
ports:
- "7860:7860"
ipc: host
volumes:
- ${PWD}:/outputs
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
DEVICE: ${DEVICE}
INFERENCE_MODE: ${INFERENCE_MODE}
CHECKPOINT_PATH: ${CHECKPOINT_PATH}
FACE: ${FACE}
AUDIO: ${AUDIO}
FACESIZE: ${FACESIZE}
OUTFILE: ${OUTFILE}
GFPGAN_MODEL_VERSION: ${GFPGAN_MODEL_VERSION}
UPSCALE_FACTOR: ${UPSCALE_FACTOR}
FPS: ${FPS}
WAV2LIP_PORT: ${WAV2LIP_PORT}
restart: unless-stopped
animation:
image: ${REGISTRY:-opea}/animation:${TAG:-latest}
container_name: animation-server
ports:
- "3008:9066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
WAV2LIP_ENDPOINT: ${WAV2LIP_ENDPOINT}
restart: unless-stopped
avatarchatbot-backend-server:
image: ${REGISTRY:-opea}/avatarchatbot:${TAG:-latest}
container_name: avatarchatbot-backend-server
depends_on:
- asr
- llm
- tts
- animation
ports:
- "3009:8888"
environment:
no_proxy: ${no_proxy}
https_proxy: ${https_proxy}
http_proxy: ${http_proxy}
MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP}
MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT}
ASR_SERVICE_HOST_IP: ${ASR_SERVICE_HOST_IP}
ASR_SERVICE_PORT: ${ASR_SERVICE_PORT}
LLM_SERVICE_HOST_IP: ${LLM_SERVICE_HOST_IP}
LLM_SERVICE_PORT: ${LLM_SERVICE_PORT}
LLM_SERVER_HOST_IP: ${LLM_SERVICE_HOST_IP}
LLM_SERVER_PORT: ${LLM_SERVICE_PORT}
TTS_SERVICE_HOST_IP: ${TTS_SERVICE_HOST_IP}
TTS_SERVICE_PORT: ${TTS_SERVICE_PORT}
ANIMATION_SERVICE_HOST_IP: ${ANIMATION_SERVICE_HOST_IP}
ANIMATION_SERVICE_PORT: ${ANIMATION_SERVICE_PORT}
WHISPER_SERVER_HOST_IP: ${WHISPER_SERVER_HOST_IP}
WHISPER_SERVER_PORT: ${WHISPER_SERVER_PORT}
SPEECHT5_SERVER_HOST_IP: ${SPEECHT5_SERVER_HOST_IP}
SPEECHT5_SERVER_PORT: ${SPEECHT5_SERVER_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export OPENAI_API_KEY=${OPENAI_API_KEY}
export host_ip=$(hostname -I | awk '{print $1}')
export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
export WHISPER_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_PORT=7055
export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}
export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008
export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="/home/user/comps/animation/src/assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional; if AUDIO is 'None', the base64 string in the POST request is used as input
export AUDIO='None'
export FACESIZE=96
export OUTFILE="/outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=10

View File

@@ -0,0 +1,170 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
if ls $LOG_PATH/*.log 1> /dev/null 2>&1; then
rm $LOG_PATH/*.log
echo "Log files removed."
else
echo "No log files to remove."
fi
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="avatarchatbot whisper asr llm-textgen speecht5 tts wav2lip animation"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.3.1-rocm
docker images && sleep 3s
}
function start_services() {
cd $WORKPATH/docker_compose/amd/gpu/rocm
export HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN
export OPENAI_API_KEY=$OPENAI_API_KEY
export host_ip=${ip_address}
export TGI_SERVICE_PORT=3006
export TGI_LLM_ENDPOINT=http://${host_ip}:${TGI_SERVICE_PORT}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export ASR_ENDPOINT=http://${host_ip}:7066
export TTS_ENDPOINT=http://${host_ip}:7055
export WAV2LIP_ENDPOINT=http://${host_ip}:7860
export MEGA_SERVICE_HOST_IP=${host_ip}
export ASR_SERVICE_HOST_IP=${host_ip}
export TTS_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export ANIMATION_SERVICE_HOST_IP=${host_ip}
export WHISPER_SERVER_HOST_IP=${host_ip}
export WHISPER_SERVER_PORT=7066
export SPEECHT5_SERVER_HOST_IP=${host_ip}
export SPEECHT5_SERVER_PORT=7055
export MEGA_SERVICE_PORT=8888
export ASR_SERVICE_PORT=3001
export TTS_SERVICE_PORT=3002
export LLM_SERVICE_PORT=3007
export ANIMATION_SERVICE_PORT=3008
export DEVICE="cpu"
export WAV2LIP_PORT=7860
export INFERENCE_MODE='wav2lip+gfpgan'
export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth'
export FACE="/home/user/comps/animation/src/assets/img/avatar5.png"
# export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional; if AUDIO is 'None', the base64 string in the POST request is used as input
export AUDIO='None'
export FACESIZE=96
export OUTFILE="./outputs/result.mp4"
export GFPGAN_MODEL_VERSION=1.4 # latest version, can roll back to v1.3 if needed
export UPSCALE_FACTOR=1
export FPS=5
# Start Docker Containers
docker compose up -d --force-recreate
echo "Check tgi-service status"
n=0
until [[ "$n" -ge 100 ]]; do
docker logs tgi-service > $LOG_PATH/tgi_service_start.log
if grep -q Connected $LOG_PATH/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
echo "tgi-service are up and running"
sleep 5s
echo "Check wav2lip-service status"
n=0
until [[ "$n" -ge 100 ]]; do
docker logs wav2lip-service >& $LOG_PATH/wav2lip-service_start.log
if grep -q "Application startup complete" $LOG_PATH/wav2lip-service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
echo "wav2lip-service are up and running"
sleep 5s
}
function validate_megaservice() {
cd $WORKPATH
ls
result=$(http_proxy="" curl http://${ip_address}:3009/v1/avatarchatbot -X POST -d @assets/audio/sample_whoareyou.json -H 'Content-Type: application/json')
echo "result is === $result"
if [[ $result == *"mp4"* ]]; then
echo "Result correct."
else
docker logs whisper-service > $LOG_PATH/whisper-service.log
docker logs asr-service > $LOG_PATH/asr-service.log
docker logs speecht5-service > $LOG_PATH/speecht5-service.log
docker logs tts-service > $LOG_PATH/tts-service.log
docker logs tgi-service > $LOG_PATH/tgi-service.log
docker logs llm-tgi-server > $LOG_PATH/llm-tgi-server.log
docker logs wav2lip-service > $LOG_PATH/wav2lip-service.log
docker logs animation-server > $LOG_PATH/animation-server.log
echo "Result wrong."
exit 1
fi
}
#function validate_frontend() {
#}
function stop_docker() {
cd $WORKPATH/docker_compose/amd/gpu/rocm
docker compose down && docker compose rm -f
}
function main() {
echo $OPENAI_API_KEY
echo $OPENAI_KEY
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
# validate_microservices
sleep 30
validate_megaservice
# validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -68,13 +68,16 @@ To set up environment variables for deploying ChatQnA services, follow these ste
```bash
# on Gaudi
source ./docker_compose/intel/hpu/gaudi/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
# on Xeon
source ./docker_compose/intel/cpu/xeon/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
# on Nvidia GPU
source ./docker_compose/nvidia/gpu/set_env.sh
cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
source ./set_env.sh
export no_proxy="Your_No_Proxy",chatqna-ui-server,chatqna-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service
```
@@ -91,6 +94,14 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
CPU example with Open Telemetry feature:
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
It will automatically download the Docker images from Docker Hub:
```bash
@@ -232,6 +243,13 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.
### Deploy ChatQnA on Xeon
@@ -243,6 +261,13 @@ cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.
### Deploy ChatQnA on NVIDIA GPU
@@ -346,7 +371,7 @@ OPEA microservice deployment can easily be monitored through Grafana dashboards
## Tracing Services with OpenTelemetry Tracing and Jaeger
> NOTE: limited support. Only LLM inference serving with TGI on Gaudi is enabled for this feature.
> NOTE: This feature is disabled by default. Please check the Deploy ChatQnA sections for how to enable this feature with the compose.telemetry.yaml file.
OPEA microservices and TGI/TEI serving can easily be traced through Jaeger dashboards in conjunction with the OpenTelemetry Tracing feature. Follow the [README](https://github.com/opea-project/GenAIComps/tree/main/comps/cores/telemetry#tracing) to trace additional functions if needed.
@@ -357,8 +382,17 @@ Users could also get the external IP via below command.
ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+'
```
Access the Jaeger dashboard UI at http://{EXTERNAL_IP}:16686
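Before digging into traces, a quick reachability check against Jaeger's query API can confirm the collector and UI are up (a hedged example; it assumes `EXTERNAL_IP` is exported, e.g. from the `ip route` command above):
```bash
# Returns a JSON list of registered service names once traces start flowing
curl -s http://${EXTERNAL_IP}:16686/api/services
```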
For TGI serving on Gaudi, users could see different services like opea, TEI and TGI.
![Screenshot from 2024-12-27 11-58-18](https://github.com/user-attachments/assets/6126fa70-e830-4780-bd3f-83cb6eff064e)
Here is a screenshot of one trace of a TGI serving request.
![Screenshot from 2024-12-27 11-26-25](https://github.com/user-attachments/assets/3a7c51c6-f422-41eb-8e82-c3df52cd48b8)
There are also OPEA related tracings. Users could understand the time breakdown of each service request by looking into each opea:schedule operation.
![image](https://github.com/user-attachments/assets/6137068b-b374-4ff8-b345-993343c0c25f)
There can also be async functions such as `llm/MicroService_asyn_generate`; the user needs to check the trace of the async function in another operation such as `opea:llm_generate_stream`.
![image](https://github.com/user-attachments/assets/a973d283-198f-4ce2-a7eb-58515b77503e)

View File

@@ -0,0 +1,112 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
deploy:
device: gaudi
version: 1.2.0
modelUseHostPath: /mnt/models
HUGGINGFACEHUB_API_TOKEN: "" # mandatory
node: [1, 2, 4, 8]
namespace: ""
timeout: 1000 # timeout in seconds for services to be ready, default 30 minutes
interval: 5 # interval in seconds between service ready checks, default 5 seconds
services:
backend:
resources:
enabled: False
cores_per_instance: "16"
memory_capacity: "8000Mi"
replicaCount: [1, 2, 4, 8]
teirerank:
enabled: True
model_id: ""
resources:
enabled: False
cards_per_instance: 1
replicaCount: [1, 1, 1, 1]
tei:
model_id: ""
resources:
enabled: False
cores_per_instance: "80"
memory_capacity: "20000Mi"
replicaCount: [1, 2, 4, 8]
llm:
engine: vllm # or tgi
model_id: "meta-llama/Meta-Llama-3-8B-Instruct" # mandatory
replicaCount:
with_teirerank: [7, 15, 31, 63] # When teirerank.enabled is True
without_teirerank: [8, 16, 32, 64] # When teirerank.enabled is False
resources:
enabled: False
cards_per_instance: 1
model_params:
vllm: # VLLM specific parameters
batch_params:
enabled: True
max_num_seqs: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
token_params:
enabled: False
max_input_length: ""
max_total_tokens: ""
max_batch_total_tokens: ""
max_batch_prefill_tokens: ""
tgi: # TGI specific parameters
batch_params:
enabled: True
max_batch_size: [1, 2, 4, 8] # Each value triggers an LLM service upgrade
token_params:
enabled: False
max_input_length: "1280"
max_total_tokens: "2048"
max_batch_total_tokens: "65536"
max_batch_prefill_tokens: "4096"
data-prep:
resources:
enabled: False
cores_per_instance: ""
memory_capacity: ""
replicaCount: [1, 1, 1, 1]
retriever-usvc:
resources:
enabled: False
cores_per_instance: "8"
memory_capacity: "8000Mi"
replicaCount: [1, 2, 4, 8]
redis-vector-db:
resources:
enabled: False
cores_per_instance: ""
memory_capacity: ""
replicaCount: [1, 1, 1, 1]
chatqna-ui:
replicaCount: [1, 1, 1, 1]
nginx:
replicaCount: [1, 1, 1, 1]
benchmark:
# http request behavior related fields
user_queries: [640]
concurrency: [128]
load_shape_type: "constant" # "constant" or "poisson"
poisson_arrival_rate: 1.0 # only used when load_shape_type is "poisson"
warmup_iterations: 10
seed: 1024
# workload, all of the test cases will run for benchmark
bench_target: [chatqnafixed, chatqna_qlist_pubmed] # specify the bench_target for benchmark
dataset: ["/home/sdp/upload_file.txt", "/home/sdp/pubmed_10000.txt"] # specify the absolute path to the dataset file
prompt: [10, 1000] # set the prompt length for the chatqna_qlist_pubmed workload, set to 10 for chatqnafixed workload
llm:
# specify the llm output token size
max_token_size: [128, 256]

View File

@@ -167,7 +167,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
def align_generator(self, gen, **kwargs):
# openai reaponse format
# OpenAI response format
# b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
for line in gen:
line = line.decode("utf-8")

View File

@@ -1,4 +1,4 @@
# Build and deploy CodeGen Application on AMD GPU (ROCm)
# Build and deploy ChatQnA Application on AMD GPU (ROCm)
## Build MegaService of ChatQnA on AMD ROCm GPU

View File

@@ -30,7 +30,7 @@ services:
ports:
- "${CHATQNA_TEI_EMBEDDING_PORT}:80"
volumes:
- "/var/opea/chatqna-service/data:/data"
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
ipc: host
environment:
@@ -72,7 +72,7 @@ services:
ports:
- "${CHATQNA_TEI_RERANKING_PORT}:80"
volumes:
- "/var/opea/chatqna-service/data:/data"
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -104,7 +104,7 @@ services:
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
volumes:
- "/var/opea/chatqna-service/data:/data"
- "${MODEL_CACHE:-/var/opea/chatqna-service/data}:/data"
shm_size: 1g
devices:
- /dev/kfd:/dev/kfd

View File

@@ -2,6 +2,84 @@
This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on AIPC. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`.
## Quick Start:
1. Set up the environment variables.
2. Run Docker Compose.
3. Consume the ChatQnA Service.
### Quick Start: 1. Set up Environment Variables
To set up environment variables for deploying ChatQnA services, follow these steps:
```bash
mkdir ~/OPEA -p
cd ~/OPEA
git clone https://github.com/opea-project/GenAIExamples.git
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/aipc
```
1. Set the required environment variables:
```bash
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
2. If you are in a proxy environment, also set the proxy-related environment variables:
```bash
export https_proxy="Your_HTTPs_Proxy"
# Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
export no_proxy=$no_proxy,chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
```
3. Set up other environment variables
By default, llama3.2 is used for LLM serving; the default model can be changed to another LLM model. Please pick a [validated LLM model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, overwrite it by exporting OLLAMA_MODEL to the new model or by modifying set_env.sh.
For example, to use the [DeepSeek-R1-Distill-Llama-8B model](https://ollama.com/library/deepseek-r1:8b), export the following:
```bash
export OLLAMA_MODEL="deepseek-r1:8b"
```
Then set up the remaining environment variables:
```bash
source ./set_env.sh
```
### Quick Start: 2. Run Docker Compose
```bash
docker compose up -d
```
It will take several minutes to automatically download the Docker images (a status check is shown after the notes below).
NB: You should build the Docker images from source yourself if:
- You are developing off the git main branch (as the container's ports in the repo may differ from the published Docker image).
- You can't download the Docker image.
- You want to use a specific version of the Docker image.
Please refer to ['Build Docker Images'](#🚀-build-docker-images) below.
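While the images are being pulled, the commands below can be used to watch progress and confirm the stack has settled (a minimal sketch; it assumes you are still in the compose directory):
```bash
# Show pull/startup status of every service defined in compose.yaml
docker compose ps
# Follow the logs of the whole stack until the services are ready
docker compose logs -f --tail=50
```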
### Quick Start: 3. Consume the ChatQnA Service
Once the services are up, open the following URL from your browser: http://{host_ip}:80.
Enter a prompt such as "What is deep learning?".
Or, if you prefer to test from the localhost machine, try:
```bash
curl http://${host_ip}:8888/v1/chatqna \
-H "Content-Type: application/json" \
-d '{
"messages": "What is deep learning?"
}'
```
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally and install the required Python package.
@@ -82,18 +160,18 @@ export host_ip="External_Public_IP"
For Linux users, please run `hostname -I | awk '{print $1}'`. For Windows users, please run `ipconfig | findstr /i "IPv4"` to get the external public ip.
**Export the value of your Huggingface API token to the `your_hf_api_token` environment variable**
**Export the value of your Huggingface API token to the `HUGGINGFACEHUB_API_TOKEN` environment variable**
> Replace Your_Huggingface_API_Token below with your actual Huggingface API token value
```
export your_hf_api_token="Your_Huggingface_API_Token"
export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
**Append the value of the public IP address to the no_proxy list if you are in a proxy environment**
```
export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service
export your_no_proxy=${your_no_proxy},"External_Public_IP",chatqna-aipc-backend-server,tei-embedding-service,retriever,tei-reranking-service,redis-vector-db,dataprep-redis-service,ollama-service
```
- Linux PC
@@ -105,7 +183,7 @@ export https_proxy=${your_http_proxy}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export OLLAMA_HOST=${host_ip}
export OLLAMA_MODEL="llama3.2"
```
@@ -116,7 +194,7 @@ export OLLAMA_MODEL="llama3.2"
set EMBEDDING_MODEL_ID=BAAI/bge-base-en-v1.5
set RERANK_MODEL_ID=BAAI/bge-reranker-base
set INDEX_NAME=rag-redis
set HUGGINGFACEHUB_API_TOKEN=%your_hf_api_token%
set HUGGINGFACEHUB_API_TOKEN=%HUGGINGFACEHUB_API_TOKEN%
set OLLAMA_HOST=host.docker.internal
set OLLAMA_MODEL="llama3.2"
```

View File

@@ -109,7 +109,7 @@ services:
- RETRIEVER_SERVICE_HOST_IP=retriever
- RERANK_SERVER_HOST_IP=tei-reranking-service
- RERANK_SERVER_PORT=80
- LLM_SERVER_HOST_IP=${OLLAMA_HOST}
- LLM_SERVER_HOST_IP=ollama-service
- LLM_SERVER_PORT=11434
- LLM_MODEL=${OLLAMA_MODEL}
- LOGFLAG=${LOGFLAG}

View File

@@ -7,15 +7,17 @@ pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
if [ -z "${your_hf_api_token}" ]; then
echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set your_hf_api_token."
export host_ip=$(hostname -I | awk '{print $1}')
if [ -z "${HUGGINGFACEHUB_API_TOKEN}" ]; then
echo "Error: HUGGINGFACEHUB_API_TOKEN is not set. Please set HUGGINGFACEHUB_API_TOKEN."
fi
if [ -z "${host_ip}" ]; then
echo "Error: host_ip is not set. Please set host_ip first."
fi
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export INDEX_NAME="rag-redis"

View File

@@ -34,16 +34,36 @@ To set up environment variables for deploying ChatQnA services, follow these ste
```
3. Set up other environment variables:
```bash
source ./set_env.sh
```
4. Change Model for LLM serving
By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default model can be changed to other validated LLM models.
Please pick a [validated llm models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, overwrite it by exporting LLM_MODEL_ID to the new model or by modifying set_env.sh, and then repeat step 3.
For example, switch to Llama-2-7b-chat-hf using the following command:
```bash
export LLM_MODEL_ID="meta-llama/Llama-2-7b-chat-hf"
```
## Quick Start: 2. Run Docker Compose
```bash
docker compose up -d
```
To enable OpenTelemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
CPU example with Open Telemetry feature:
```bash
cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
It will automatically download the Docker images from Docker Hub:
```bash
@@ -263,12 +283,16 @@ If use vLLM as the LLM serving backend.
docker compose -f compose.yaml up -d
# Start ChatQnA without Rerank Pipeline
docker compose -f compose_without_rerank.yaml up -d
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
# Start ChatQnA with Open Telemetry Tracing
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
```
### Validate Microservices

View File

@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tei-embedding-service:
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
tei-reranking-service:
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
jaeger:
image: jaegertracing/all-in-one:latest
container_name: jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
- "9411:9411"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
chatqna-xeon-backend-server:
environment:
- ENABLE_OPEA_TELEMETRY=true
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -0,0 +1,227 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
etcd:
container_name: milvus-etcd
image: quay.io/coreos/etcd:v3.5.5
environment:
- ETCD_AUTO_COMPACTION_MODE=revision
- ETCD_AUTO_COMPACTION_RETENTION=1000
- ETCD_QUOTA_BACKEND_BYTES=4294967296
- ETCD_SNAPSHOT_COUNT=50000
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
healthcheck:
test: ["CMD", "etcdctl", "endpoint", "health"]
interval: 30s
timeout: 20s
retries: 3
minio:
container_name: milvus-minio
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
environment:
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
ports:
- "${MINIO_PORT1:-5044}:9001"
- "${MINIO_PORT2:-5043}:9000"
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
command: minio server /minio_data --console-address ":9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
milvus-standalone:
container_name: milvus-standalone
image: milvusdb/milvus:v2.4.6
command: ["milvus", "run", "standalone"]
security_opt:
- seccomp:unconfined
environment:
ETCD_ENDPOINTS: etcd:2379
MINIO_ADDRESS: minio:9000
volumes:
- ${DOCKER_VOLUME_DIRECTORY:-.}/milvus.yaml:/milvus/configs/milvus.yaml
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
interval: 30s
start_period: 90s
timeout: 20s
retries: 3
ports:
- "19530:19530"
- "${MILVUS_STANDALONE_PORT:-9091}:9091"
depends_on:
- "etcd"
- "minio"
dataprep-milvus-service:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-milvus-server
ports:
- "${DATAPREP_PORT:-11101}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MILVUS"
MILVUS_HOST: ${host_ip}
MILVUS_PORT: 19530
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
LOGFLAG: ${LOGFLAG}
restart: unless-stopped
depends_on:
milvus-standalone:
condition: service_healthy
etcd:
condition: service_healthy
minio:
condition: service_healthy
retriever:
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
container_name: retriever-milvus-server
depends_on:
- milvus-standalone
ports:
- "7000:7000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
MILVUS_HOST: ${host_ip}
MILVUS_PORT: 19530
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LOGFLAG: ${LOGFLAG}
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_MILVUS"
restart: unless-stopped
tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: tei-embedding-server
ports:
- "6006:80"
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
tei-reranking-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: tei-reranking-server
ports:
- "8808:80"
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: vllm-service
ports:
- "9009:80"
volumes:
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
chatqna-xeon-backend-server:
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
container_name: chatqna-xeon-backend-server
depends_on:
- milvus-standalone
- tei-embedding-service
- dataprep-milvus-service
- retriever
- tei-reranking-service
- vllm-service
ports:
- "8888:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server
- EMBEDDING_SERVER_HOST_IP=tei-embedding-service
- RETRIEVER_SERVICE_HOST_IP=retriever
- EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80}
- RERANK_SERVER_HOST_IP=tei-reranking-service
- RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
- LLM_SERVER_HOST_IP=vllm-service
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
- LLM_MODEL=${LLM_MODEL_ID}
- LOGFLAG=${LOGFLAG}
ipc: host
restart: always
chatqna-xeon-ui-server:
image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest}
container_name: chatqna-xeon-ui-server
depends_on:
- chatqna-xeon-backend-server
ports:
- "5173:5173"
ipc: host
restart: always
chatqna-xeon-nginx-server:
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
container_name: chatqna-xeon-nginx-server
depends_on:
- chatqna-xeon-backend-server
- chatqna-xeon-ui-server
ports:
- "${NGINX_PORT:-80}:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- FRONTEND_SERVICE_IP=chatqna-xeon-ui-server
- FRONTEND_SERVICE_PORT=5173
- BACKEND_SERVICE_NAME=chatqna
- BACKEND_SERVICE_IP=chatqna-xeon-backend-server
- BACKEND_SERVICE_PORT=8888
- DATAPREP_SERVICE_IP=dataprep-milvus-service
- DATAPREP_SERVICE_PORT=5000
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -28,7 +28,7 @@ services:
ports:
- "6006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -59,7 +59,7 @@ services:
ports:
- "8808:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -75,7 +75,7 @@ services:
ports:
- "9009:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -32,7 +32,7 @@ services:
ports:
- "6040:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "6041:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "6042:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -0,0 +1,29 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tei-embedding-service:
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
tei-reranking-service:
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
tgi-service:
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
jaeger:
image: jaegertracing/all-in-one:latest
container_name: jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
- "9411:9411"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
chatqna-xeon-backend-server:
environment:
- ENABLE_OPEA_TELEMETRY=true
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -80,7 +80,7 @@ services:
ports:
- "9009:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "6006:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "9009:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}

View File

@@ -0,0 +1,811 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Licensed to the LF AI & Data foundation under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Related configuration of etcd, used to store Milvus metadata & service discovery.
etcd:
endpoints: localhost:2379
rootPath: by-dev # The root path where data is stored in etcd
metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
log:
level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
# path is one of:
# - "default" as os.Stderr,
# - "stderr" as os.Stderr,
# - "stdout" as os.Stdout,
# - file path to append server logs to.
# please adjust in embedded Milvus: /tmp/milvus/logs/etcd.log
path: stdout
ssl:
enabled: false # Whether to support ETCD secure connection mode
tlsCert: /path/to/etcd-client.pem # path to your cert file
tlsKey: /path/to/etcd-client-key.pem # path to your key file
tlsCACert: /path/to/ca.pem # path to your CACert file
# TLS min version
# Optional values: 1.0, 1.1, 1.2, 1.3。
# We recommend using version 1.2 and above.
tlsMinVersion: 1.3
requestTimeout: 10000 # Etcd operation timeout in milliseconds
use:
embed: false # Whether to enable embedded Etcd (an in-process EtcdServer).
data:
dir: default.etcd # Embedded Etcd only. please adjust in embedded Milvus: /tmp/milvus/etcdData/
auth:
enabled: false # Whether to enable authentication
userName: # username for etcd authentication
password: # password for etcd authentication
metastore:
type: etcd # Default value: etcd, Valid values: [etcd, tikv]
# Related configuration of tikv, used to store Milvus metadata.
# Notice that when TiKV is enabled for metastore, you still need to have etcd for service discovery.
# TiKV is a good option when the metadata size requires better horizontal scalability.
tikv:
endpoints: 127.0.0.1:2389 # Note that the default pd port of tikv is 2379, which conflicts with etcd.
rootPath: by-dev # The root path where data is stored in tikv
metaSubPath: meta # metaRootPath = rootPath + '/' + metaSubPath
kvSubPath: kv # kvRootPath = rootPath + '/' + kvSubPath
requestTimeout: 10000 # ms, tikv request timeout
snapshotScanSize: 256 # batch size of tikv snapshot scan
ssl:
enabled: false # Whether to support TiKV secure connection mode
tlsCert: # path to your cert file
tlsKey: # path to your key file
tlsCACert: # path to your CACert file
localStorage:
path: /var/lib/milvus/data/ # please adjust in embedded Milvus: /tmp/milvus/data/
# Related configuration of MinIO/S3/GCS or any other service supports S3 API, which is responsible for data persistence for Milvus.
# We refer to the storage service as MinIO/S3 in the following description for simplicity.
minio:
address: localhost # Address of MinIO/S3
port: 9000 # Port of MinIO/S3
accessKeyID: minioadmin # accessKeyID of MinIO/S3
secretAccessKey: minioadmin # MinIO/S3 encryption string
useSSL: false # Access to MinIO/S3 with SSL
ssl:
tlsCACert: /path/to/public.crt # path to your CACert file
bucketName: a-bucket # Bucket name in MinIO/S3
rootPath: files # The root path where the message is stored in MinIO/S3
# Whether to useIAM role to access S3/GCS instead of access/secret keys
# For more information, refer to
# aws: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html
# gcp: https://cloud.google.com/storage/docs/access-control/iam
# aliyun (ack): https://www.alibabacloud.com/help/en/container-service-for-kubernetes/latest/use-rrsa-to-enforce-access-control
# aliyun (ecs): https://www.alibabacloud.com/help/en/elastic-compute-service/latest/attach-an-instance-ram-role
useIAM: false
# Cloud Provider of S3. Supports: "aws", "gcp", "aliyun".
# You can use "aws" for other cloud provider supports S3 API with signature v4, e.g.: minio
# You can use "gcp" for other cloud provider supports S3 API with signature v2
# You can use "aliyun" for other cloud provider uses virtual host style bucket
# When useIAM enabled, only "aws", "gcp", "aliyun" is supported for now
cloudProvider: aws
# Custom endpoint for fetch IAM role credentials. when useIAM is true & cloudProvider is "aws".
# Leave it empty if you want to use AWS default endpoint
iamEndpoint:
logLevel: fatal # Log level for aws sdk log. Supported level: off, fatal, error, warn, info, debug, trace
region: # Specify minio storage system location region
useVirtualHost: false # Whether use virtual host mode for bucket
requestTimeoutMs: 10000 # minio timeout for request time in milliseconds
# The maximum number of objects requested per batch in minio ListObjects rpc,
# 0 means using oss client by default, decrease these configuration if ListObjects timeout
listObjectsMaxKeys: 0
# Milvus supports four MQ: rocksmq(based on RockDB), natsmq(embedded nats-server), Pulsar and Kafka.
# You can change your mq by setting mq.type field.
# If you don't set mq.type field as default, there is a note about enabling priority if we config multiple mq in this file.
# 1. standalone(local) mode: rocksmq(default) > natsmq > Pulsar > Kafka
# 2. cluster mode: Pulsar(default) > Kafka (rocksmq and natsmq is unsupported in cluster mode)
mq:
# Default value: "default"
# Valid values: [default, pulsar, kafka, rocksmq, natsmq]
type: default
enablePursuitMode: true # Default value: "true"
pursuitLag: 10 # time tick lag threshold to enter pursuit mode, in seconds
pursuitBufferSize: 8388608 # pursuit mode buffer size in bytes
mqBufSize: 16 # MQ client consumer buffer length
dispatcher:
mergeCheckInterval: 1 # the interval time(in seconds) for dispatcher to check whether to merge
targetBufSize: 16 # the length of channel buffer for targe
maxTolerantLag: 3 # Default value: "3", the timeout(in seconds) that target sends msgPack
# Related configuration of pulsar, used to manage Milvus logs of recent mutation operations, output streaming log, and provide log publish-subscribe services.
pulsar:
address: localhost # Address of pulsar
port: 6650 # Port of Pulsar
webport: 80 # Web port of pulsar, if you connect directly without proxy, should use 8080
maxMessageSize: 5242880 # 5 * 1024 * 1024 Bytes, Maximum size of each message in pulsar.
tenant: public
namespace: default
requestTimeout: 60 # pulsar client global request timeout in seconds
enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path.
# If you want to enable kafka, needs to comment the pulsar configs
# kafka:
# brokerList:
# saslUsername:
# saslPassword:
# saslMechanisms:
# securityProtocol:
# ssl:
# enabled: false # whether to enable ssl mode
# tlsCert: # path to client's public key (PEM) used for authentication
# tlsKey: # path to client's private key (PEM) used for authentication
# tlsCaCert: # file or directory path to CA certificate(s) for verifying the broker's key
# tlsKeyPassword: # private key passphrase for use with ssl.key.location and set_ssl_cert(), if any
# readTimeout: 10
rocksmq:
# The path where the message is stored in rocksmq
# please adjust in embedded Milvus: /tmp/milvus/rdb_data
path: /var/lib/milvus/rdb_data
lrucacheratio: 0.06 # rocksdb cache memory ratio
rocksmqPageSize: 67108864 # 64 MB, 64 * 1024 * 1024 bytes, The size of each page of messages in rocksmq
retentionTimeInMinutes: 4320 # 3 days, 3 * 24 * 60 minutes, The retention time of the message in rocksmq.
retentionSizeInMB: 8192 # 8 GB, 8 * 1024 MB, The retention size of the message in rocksmq.
compactionInterval: 86400 # 1 day, trigger rocksdb compaction every day to remove deleted data
compressionTypes: 0,0,7,7,7 # compaction compression type, only support use 0,7. 0 means not compress, 7 will use zstd. Length of types means num of rocksdb level.
# natsmq configuration.
# more detail: https://docs.nats.io/running-a-nats-service/configuration
natsmq:
server:
port: 4222 # Port for nats server listening
storeDir: /var/lib/milvus/nats # Directory to use for JetStream storage of nats
maxFileStore: 17179869184 # Maximum size of the 'file' storage
maxPayload: 8388608 # Maximum number of bytes in a message payload
maxPending: 67108864 # Maximum number of bytes buffered for a connection Applies to client connections
initializeTimeout: 4000 # waiting for initialization of natsmq finished
monitor:
trace: false # If true enable protocol trace log messages
debug: false # If true enable debug log messages
logTime: true # If set to false, log without timestamps.
logFile: /tmp/milvus/logs/nats.log # Log file path relative to .. of milvus binary if use relative path
logSizeLimit: 536870912 # Size in bytes after the log file rolls over to a new one
retention:
maxAge: 4320 # Maximum age of any message in the P-channel
maxBytes: # How many bytes the single P-channel may contain. Removing oldest messages if the P-channel exceeds this size
maxMsgs: # How many message the single P-channel may contain. Removing oldest messages if the P-channel exceeds this limit
# Related configuration of rootCoord, used to handle data definition language (DDL) and data control language (DCL) requests
rootCoord:
dmlChannelNum: 16 # The number of dml channels created at system startup
maxPartitionNum: 1024 # Maximum number of partitions in a collection
minSegmentSizeToEnableIndex: 1024 # It's a threshold. When the segment size is less than this value, the segment will not be indexed
enableActiveStandby: false
maxDatabaseNum: 64 # Maximum number of database
maxGeneralCapacity: 65536 # upper limit for the sum of of product of partitionNumber and shardNumber
gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
ip: # if not specified, use the first unicastable address
port: 53100
grpc:
serverMaxSendSize: 536870912
serverMaxRecvSize: 268435456
clientMaxSendSize: 268435456
clientMaxRecvSize: 536870912
# Related configuration of proxy, used to validate client requests and reduce the returned results.
proxy:
timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
healthCheckTimeout: 3000 # ms, the interval that to do component healthy check
msgStream:
timeTick:
bufSize: 512
maxNameLength: 255 # Maximum length of name for a collection or alias
# Maximum number of fields in a collection.
# As of today (2.2.0 and after) it is strongly DISCOURAGED to set maxFieldNum >= 64.
# So adjust at your risk!
maxFieldNum: 64
maxVectorFieldNum: 4 # Maximum number of vector fields in a collection.
maxShardNum: 16 # Maximum number of shards in a collection
maxDimension: 32768 # Maximum dimension of a vector
# Whether to produce gin logs.\n
# please adjust in embedded Milvus: false
ginLogging: true
ginLogSkipPaths: / # skip url path for gin log
maxTaskNum: 1024 # max task number of proxy task queue
mustUsePartitionKey: false # switch for whether proxy must use partition key for the collection
accessLog:
enable: false # if use access log
minioEnable: false # if upload sealed access log file to minio
localPath: /tmp/milvus_access
filename: # Log filename, leave empty to use stdout.
maxSize: 64 # Max size for a single file, in MB.
cacheSize: 10240 # Size of log of memory cache, in B
rotatedTime: 0 # Max time for single access log file in seconds
remotePath: access_log/ # File path in minIO
remoteMaxTime: 0 # Max time for log file in minIO, in hours
formatters:
base:
format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost]"
query:
format: "[$time_now] [ACCESS] <$user_name: $user_addr> $method_name [status: $method_status] [code: $error_code] [sdk: $sdk_version] [msg: $error_msg] [traceID: $trace_id] [timeCost: $time_cost] [database: $database_name] [collection: $collection_name] [partitions: $partition_name] [expr: $method_expr]"
methods: "Query,Search,Delete"
connectionCheckIntervalSeconds: 120 # the interval time(in seconds) for connection manager to scan inactive client info
connectionClientInfoTTLSeconds: 86400 # inactive client info TTL duration, in seconds
maxConnectionNum: 10000 # the max client info numbers that proxy should manage, avoid too many client infos
gracefulStopTimeout: 30 # seconds. force stop node without graceful stop
slowQuerySpanInSeconds: 5 # query whose executed time exceeds the `slowQuerySpanInSeconds` can be considered slow, in seconds.
http:
enabled: true # Whether to enable the http server
debug_mode: false # Whether to enable http server debug mode
port: # high-level restful api
acceptTypeAllowInt64: true # high-level restful api, whether http client can deal with int64
enablePprof: true # Whether to enable pprof middleware on the metrics port
ip: # if not specified, use the first unicastable address
port: 19530
internalPort: 19529
grpc:
serverMaxSendSize: 268435456
serverMaxRecvSize: 67108864
clientMaxSendSize: 268435456
clientMaxRecvSize: 67108864
# Related configuration of queryCoord, used to manage topology and load balancing for the query nodes, and handoff from growing segments to sealed segments.
queryCoord:
taskMergeCap: 1
taskExecutionCap: 256
autoHandoff: true # Enable auto handoff
autoBalance: true # Enable auto balance
autoBalanceChannel: true # Enable auto balance channel
balancer: ScoreBasedBalancer # auto balancer used for segments on queryNodes
globalRowCountFactor: 0.1 # the weight used when balancing segments among queryNodes
scoreUnbalanceTolerationFactor: 0.05 # the least value for unbalanced extent between from and to nodes when doing balance
reverseUnBalanceTolerationFactor: 1.3 # the largest value for unbalanced extent between from and to nodes after doing balance
overloadedMemoryThresholdPercentage: 90 # The threshold percentage that memory overload
balanceIntervalSeconds: 60
memoryUsageMaxDifferencePercentage: 30
rowCountFactor: 0.4 # the row count weight used when balancing segments among queryNodes
segmentCountFactor: 0.4 # the segment count weight used when balancing segments among queryNodes
globalSegmentCountFactor: 0.1 # the segment count weight used when balancing segments among queryNodes
segmentCountMaxSteps: 50 # segment count based plan generator max steps
rowCountMaxSteps: 50 # segment count based plan generator max steps
randomMaxSteps: 10 # segment count based plan generator max steps
growingRowCountWeight: 4 # the memory weight of growing segment row count
balanceCostThreshold: 0.001 # the threshold of balance cost, if the difference of cluster's cost after executing the balance plan is less than this value, the plan will not be executed
checkSegmentInterval: 1000
checkChannelInterval: 1000
checkBalanceInterval: 10000
checkIndexInterval: 10000
channelTaskTimeout: 60000 # 1 minute
segmentTaskTimeout: 120000 # 2 minute
distPullInterval: 500
collectionObserverInterval: 200
checkExecutedFlagInterval: 100
heartbeatAvailableInterval: 10000 # 10s, Only QueryNodes which fetched heartbeats within the duration are available
loadTimeoutSeconds: 600
distRequestTimeout: 5000 # the request timeout for querycoord fetching data distribution from querynodes, in milliseconds
heatbeatWarningLag: 5000 # the lag value for querycoord report warning when last heartbeat is too old, in milliseconds
checkHandoffInterval: 5000
enableActiveStandby: false
checkInterval: 1000
checkHealthInterval: 3000 # 3s, the interval when query coord try to check health of query node
checkHealthRPCTimeout: 2000 # 100ms, the timeout of check health rpc to query node
brokerTimeout: 5000 # 5000ms, querycoord broker rpc timeout
collectionRecoverTimes: 3 # if collection recover times reach the limit during loading state, release it
observerTaskParallel: 16 # the parallel observer dispatcher task number
checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
checkNodeSessionInterval: 60 # the interval(in seconds) of check querynode cluster session
gracefulStopTimeout: 5 # seconds. force stop node without graceful stop
enableStoppingBalance: true # whether enable stopping balance
channelExclusiveNodeFactor: 4 # the least node number for enable channel's exclusive mode
cleanExcludeSegmentInterval: 60 # the time duration of clean pipeline exclude segment which used for filter invalid data, in seconds
ip: # if not specified, use the first unicastable address
port: 19531
grpc:
serverMaxSendSize: 536870912
serverMaxRecvSize: 268435456
clientMaxSendSize: 268435456
clientMaxRecvSize: 536870912
# Related configuration of queryNode, used to run hybrid search between vector and scalar data.
queryNode:
stats:
publishInterval: 1000 # Interval for querynode to report node information (milliseconds)
segcore:
knowhereThreadPoolNumRatio: 4 # The number of threads in knowhere's thread pool. If disk is enabled, the pool size will multiply with knowhereThreadPoolNumRatio([1, 32]).
chunkRows: 128 # The number of vectors in a chunk.
interimIndex:
enableIndex: true # Enable segment build with index to accelerate vector search when segment is in growing or binlog.
nlist: 128 # temp index nlist, recommend to set sqrt(chunkRows), must smaller than chunkRows/8
nprobe: 16 # nprobe to search small index, based on your accuracy requirement, must smaller than nlist
memExpansionRate: 1.15 # extra memory needed by building interim index
buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments
enableDisk: false # enable querynode load disk index, and search on disk index
maxDiskUsagePercentage: 95
cache:
enabled: true
memoryLimit: 2147483648 # 2 GB, 2 * 1024 *1024 *1024
readAheadPolicy: willneed # The read ahead policy of chunk cache, options: `normal, random, sequential, willneed, dontneed`
# options: async, sync, disable.
# Specifies the necessity for warming up the chunk cache.
# 1. If set to "sync" or "async" the original vector data will be synchronously/asynchronously loaded into the
# chunk cache during the load process. This approach has the potential to substantially reduce query/search latency
# for a specific duration post-load, albeit accompanied by a concurrent increase in disk usage;
# 2. If set to "disable" original vector data will only be loaded into the chunk cache during search/query.
warmup: disable
mmap:
mmapEnabled: false # Enable mmap for loading data
lazyload:
enabled: false # Enable lazyload for loading data
waitTimeout: 30000 # max wait timeout duration in milliseconds before starting lazyload search and retrieve
requestResourceTimeout: 5000 # max timeout in milliseconds for waiting request resource for lazy load, 5s by default
requestResourceRetryInterval: 2000 # retry interval in milliseconds for waiting request resource for lazy load, 2s by default
maxRetryTimes: 1 # max retry times for lazy load, 1 by default
maxEvictPerRetry: 1 # max evict count for lazy load, 1 by default
grouping:
enabled: true
maxNQ: 1000
topKMergeRatio: 20
scheduler:
receiveChanSize: 10240
unsolvedQueueSize: 10240
# maxReadConcurrentRatio is the concurrency ratio of read task (search task and query task).
# Max read concurrency would be the value of hardware.GetCPUNum * maxReadConcurrentRatio.
# It defaults to 2.0, which means max read concurrency would be the value of hardware.GetCPUNum * 2.
# Max read concurrency must be greater than or equal to 1, and less than or equal to hardware.GetCPUNum * 100.
# (0, 100]
maxReadConcurrentRatio: 1
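# Illustrative example (not part of the default config): on a 16-core host with
# maxReadConcurrentRatio: 1, at most 16 search/query tasks run concurrently; with the
# default ratio of 2.0 the limit would be 32.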
cpuRatio: 10 # ratio used to estimate read task cpu usage.
maxTimestampLag: 86400
scheduleReadPolicy:
# fifo: A FIFO queue support the schedule.
# user-task-polling:
# The user's tasks will be polled one by one and scheduled.
# Scheduling is fair on task granularity.
# The policy is based on the username for authentication.
# And an empty username is considered the same user.
# When there are no multiple users, the policy decays into FIFO.
name: fifo
taskQueueExpire: 60 # Controls how long (in seconds) an empty task queue is retained before it expires
enableCrossUserGrouping: false # Enable cross-user grouping when using the user-task-polling policy. (Disable it if users' tasks cannot be merged with each other.)
maxPendingTaskPerUser: 1024 # Max pending task per user in scheduler
dataSync:
flowGraph:
maxQueueLength: 16 # Maximum length of task queue in flowgraph
maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
enableSegmentPrune: false # use partition prune function on shard delegator
ip: # if not specified, use the first unicastable address
port: 21123
grpc:
serverMaxSendSize: 536870912
serverMaxRecvSize: 268435456
clientMaxSendSize: 268435456
clientMaxRecvSize: 536870912
indexCoord:
bindIndexNodeMode:
enable: false
address: localhost:22930
withCred: false
nodeID: 0
segment:
minSegmentNumRowsToEnableIndex: 1024 # It's a threshold. When the segment num rows is less than this value, the segment will not be indexed
indexNode:
scheduler:
buildParallel: 1
enableDisk: true # enable index node build disk vector index
maxDiskUsagePercentage: 95
ip: # if not specified, use the first unicastable address
port: 21121
grpc:
serverMaxSendSize: 536870912
serverMaxRecvSize: 268435456
clientMaxSendSize: 268435456
clientMaxRecvSize: 536870912
dataCoord:
channel:
watchTimeoutInterval: 300 # Timeout on watching channels (in seconds). Datanode tickler updates to watch progress reset the timeout timer.
balanceWithRpc: true # Whether to enable channel balancing via RPC; when disabled, etcd watch is used
legacyVersionWithoutRPCWatch: 2.4.1 # Datanodes <= this version are considered legacy nodes, which don't have rpc based watch(). This is only used during rolling upgrade where legacy nodes won't get new channels
balanceSilentDuration: 300 # The duration after which the channel manager starts background channel balancing
balanceInterval: 360 # The interval at which the channel manager checks dml channel balance status
checkInterval: 1 # The interval in seconds with which the channel manager advances channel states
notifyChannelOperationTimeout: 5 # Timeout for notifying channel operations (in seconds).
segment:
maxSize: 1024 # Maximum size of a segment in MB
diskSegmentMaxSize: 2048 # Maximum size of a segment in MB for collection which has Disk index
sealProportion: 0.12
assignmentExpiration: 2000 # The time of the assignment expiration in ms
allocLatestExpireAttempt: 200 # The number of attempts to allocate the latest lastExpire from rootCoord after restart
maxLife: 86400 # The max lifetime of segment in seconds, 24*60*60
# If a segment didn't accept dml records in maxIdleTime and the size of segment is greater than
# minSizeFromIdleToSealed, Milvus will automatically seal it.
# The max idle time of segment in seconds, 10*60.
maxIdleTime: 600
minSizeFromIdleToSealed: 16 # The minimum size in MB for an idle segment to be sealed.
# The max number of binlog files for one segment; the segment will be sealed if
# the number of binlog files reaches this max value.
maxBinlogFileNumber: 32
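# Illustrative example based on the settings above: a segment larger than 16 MB that
# receives no DML for 600 seconds is sealed automatically, as is any segment that
# accumulates 32 binlog files.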
# The segment is considered a "small segment" when its number of rows is smaller than
# (smallProportion * segment max number of rows).
smallProportion: 0.5
# A compaction will happen on small segments if the segment after compaction will have
# over (compactableProportion * segment max number of rows) rows.
# MUST BE GREATER THAN OR EQUAL TO <smallProportion>!!!
compactableProportion: 0.85
# During compaction, the number of rows in a segment is able to exceed the segment max number of rows by (expansionRate-1) * 100%.
expansionRate: 1.25
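# Worked example (assuming a hypothetical segment max of 1,000,000 rows): segments under
# 500,000 rows (smallProportion 0.5) are "small"; they are compacted only if the merged
# segment would exceed 850,000 rows (compactableProportion 0.85), and the merged segment
# may hold up to 1,250,000 rows (expansionRate 1.25).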
autoUpgradeSegmentIndex: false # whether auto upgrade segment index to index engine's version
enableCompaction: true # Enable data segment compaction
compaction:
enableAutoCompaction: true
indexBasedCompaction: true
rpcTimeout: 10
maxParallelTaskNum: 10
workerMaxParallelTaskNum: 2
levelzero:
forceTrigger:
minSize: 8388608 # The minimum size in bytes to force trigger a LevelZero Compaction, default as 8MB
maxSize: 67108864 # The maximum size in bytes to force trigger a LevelZero Compaction, default as 64MB
deltalogMinNum: 10 # The minimum number of deltalog files to force trigger a LevelZero Compaction
deltalogMaxNum: 30 # The maximum number of deltalog files to force trigger a LevelZero Compaction, default as 30
enableGarbageCollection: true
gc:
interval: 3600 # gc interval in seconds
missingTolerance: 86400 # tolerance duration in seconds for files with missing meta; defaults to 24 hr (1 day)
dropTolerance: 10800 # tolerance duration in seconds for files that belong to a dropped entity; defaults to 3 hours
removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects
scanInterval: 168 # interval in hours at which garbage collection scans for residue
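# Illustrative example based on the settings above: GC runs every hour; files whose meta
# has been missing for more than 24 hours, or that belong to an entity dropped more than
# 3 hours ago, become eligible for removal, and a residue scan runs weekly (168 hours).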
enableActiveStandby: false
brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout
autoBalance: true # Enable auto balance
checkAutoBalanceConfigInterval: 10 # the interval of check auto balance config
import:
filesPerPreImportTask: 2 # The maximum number of files allowed per pre-import task.
taskRetention: 10800 # The retention period in seconds for tasks in the Completed or Failed state.
maxSizeInMBPerImportTask: 6144 # To prevent generating small segments, imported files are re-grouped. This parameter represents the sum of file sizes in each group (each ImportTask).
scheduleInterval: 2 # The interval for scheduling import, measured in seconds.
checkIntervalHigh: 2 # The interval for checking import, measured in seconds, is set to a high frequency for the import checker.
checkIntervalLow: 120 # The interval for checking import, measured in seconds, is set to a low frequency for the import checker.
maxImportFileNumPerReq: 1024 # The maximum number of files allowed per single import request.
waitForIndex: true # Indicates whether the import operation waits for the completion of index building.
gracefulStopTimeout: 5 # seconds; force stop the node if graceful stop is not completed within this time
ip: # if not specified, use the first unicastable address
port: 13333
grpc:
serverMaxSendSize: 536870912
serverMaxRecvSize: 268435456
clientMaxSendSize: 268435456
clientMaxRecvSize: 536870912
dataNode:
dataSync:
flowGraph:
maxQueueLength: 16 # Maximum length of task queue in flowgraph
maxParallelism: 1024 # Maximum number of tasks executed in parallel in the flowgraph
maxParallelSyncMgrTasks: 256 # The max concurrent sync task number of datanode sync mgr globally
skipMode:
enable: true # Support skip some timetick message to reduce CPU usage
skipNum: 4 # Consume one for every n records skipped
coldTime: 60 # Turn on skip mode after only timetick messages have been received for this many seconds
segment:
insertBufSize: 16777216 # Max buffer size to flush for a single segment.
deleteBufBytes: 16777216 # Max buffer size in bytes to flush deletes for a single channel, default as 16MB
syncPeriod: 600 # The period to sync segments if buffer is not empty.
memory:
forceSyncEnable: true # Set true to force sync if memory usage is too high
forceSyncSegmentNum: 1 # number of segments to sync; the segments with the largest buffers will be synced.
checkInterval: 3000 # the interval to check datanode memory usage, in milliseconds
forceSyncWatermark: 0.5 # memory watermark for standalone mode; upon reaching this watermark, segments will be synced.
timetick:
byRPC: true
interval: 500
channel:
# specify the size of global work pool of all channels
# if this parameter is <= 0, it will be set to the maximum number of CPUs that can be executing
# it is suggested to set it higher when there are many collections, to avoid blocking
workPoolSize: -1
# specify the size of global work pool for channel checkpoint updating
# if this parameter is <= 0, it will be set to 10
updateChannelCheckpointMaxParallel: 10
updateChannelCheckpointInterval: 60 # the interval duration(in seconds) for datanode to update channel checkpoint of each channel
updateChannelCheckpointRPCTimeout: 20 # timeout in seconds for UpdateChannelCheckpoint RPC call
maxChannelCheckpointsPerPRC: 128 # The maximum number of channel checkpoints per UpdateChannelCheckpoint RPC.
channelCheckpointUpdateTickInSeconds: 10 # The frequency, in seconds, at which the channel checkpoint updater executes updates.
import:
maxConcurrentTaskNum: 16 # The maximum number of import/pre-import tasks allowed to run concurrently on a datanode.
maxImportFileSizeInGB: 16 # The maximum file size (in GB) for an import file, where an import file refers to either a Row-Based file or a set of Column-Based files.
readBufferSizeInMB: 16 # The data block size (in MB) read from chunk manager by the datanode during import.
compaction:
levelZeroBatchMemoryRatio: 0.05 # The minimal memory ratio of free memory for level zero compaction executing in batch mode
gracefulStopTimeout: 1800 # seconds; force stop the node if graceful stop is not completed within this time
ip: # if not specified, use the first unicastable address
port: 21124
grpc:
serverMaxSendSize: 536870912
serverMaxRecvSize: 268435456
clientMaxSendSize: 268435456
clientMaxRecvSize: 536870912
# Configures the system log output.
log:
level: info # Only supports debug, info, warn, error, panic, or fatal. Default 'info'.
file:
rootPath: # root dir path for logs; the default "" means no log file will be written. Please adjust in embedded Milvus: /tmp/milvus/logs
maxSize: 300 # MB
maxAge: 10 # Maximum time for log retention, in days.
maxBackups: 20
format: text # text or json
stdout: true # Stdout enable or not
grpc:
log:
level: WARNING
gracefulStopTimeout: 10 # seconds, time to wait for graceful stop to finish
client:
compressionEnabled: false
dialTimeout: 200
keepAliveTime: 10000
keepAliveTimeout: 20000
maxMaxAttempts: 10
initialBackoff: 0.2
maxBackoff: 10
minResetInterval: 1000
maxCancelError: 32
minSessionCheckInterval: 200
# Configure the proxy tls enable.
tls:
serverPemPath: configs/cert/server.pem
serverKeyPath: configs/cert/server.key
caPemPath: configs/cert/ca.pem
common:
defaultPartitionName: _default # default partition name for a collection
defaultIndexName: _default_idx # default index name
entityExpiration: -1 # Entity expiration in seconds, CAUTION -1 means never expire
indexSliceSize: 16 # MB
threadCoreCoefficient:
highPriority: 10 # This parameter specifies the number of threads as a multiple of the number of cores for the high priority pool
middlePriority: 5 # This parameter specifies the number of threads as a multiple of the number of cores for the middle priority pool
lowPriority: 1 # This parameter specifies the number of threads as a multiple of the number of cores for the low priority pool
buildIndexThreadPoolRatio: 0.75
DiskIndex:
MaxDegree: 56
SearchListSize: 100
PQCodeBudgetGBRatio: 0.125
BuildNumThreadsRatio: 1
SearchCacheBudgetGBRatio: 0.1
LoadNumThreadRatio: 8
BeamWidthRatio: 4
gracefulTime: 5000 # milliseconds. it represents the interval (in ms) by which the request arrival time needs to be subtracted in the case of Bounded Consistency.
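# Illustrative example: with gracefulTime: 5000 and Bounded consistency, a request
# arriving at time T is evaluated against a timestamp of roughly T - 5 s, so writes from
# the last 5 seconds may not yet be visible to that request.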
gracefulStopTimeout: 1800 # seconds. it will force quit the server if the graceful stop process is not completed during this time.
storageType: remote # please adjust in embedded Milvus: local, available values are [local, remote, opendal], value minio is deprecated, use remote instead
# Default value: auto
# Valid values: [auto, avx512, avx2, avx, sse4_2]
# This configuration is only used by querynode and indexnode, it selects CPU instruction set for Searching and Index-building.
simdType: auto
security:
authorizationEnabled: false
# The superusers will ignore some system check processes,
# like the old password verification when updating the credential
superUsers:
tlsMode: 0
session:
ttl: 30 # TTL value of the lease granted when the session registers the service
retryTimes: 30 # retry count when the session sends etcd requests
locks:
metrics:
enable: false # whether gather statistics for metrics locks
threshold:
info: 500 # minimum milliseconds for printing durations in info level
warn: 1000 # minimum milliseconds for printing durations in warn level
storage:
scheme: s3
enablev2: false
ttMsgEnabled: true # Whether the instance sends ts (time tick) messages
traceLogMode: 0 # trace request info
bloomFilterSize: 100000 # bloom filter initial size
maxBloomFalsePositive: 0.001 # max false positive rate for bloom filter
# QuotaConfig, configurations of Milvus quota and limits.
# By default, we enable:
# 1. TT protection;
# 2. Memory protection.
# 3. Disk quota protection.
# You can enable:
# 1. DML throughput limitation;
# 2. DDL, DQL qps/rps limitation;
# 3. DQL Queue length/latency protection;
# 4. DQL result rate protection;
# If necessary, you can also manually force to deny RW requests.
quotaAndLimits:
enabled: true # `true` to enable quota and limits, `false` to disable.
# quotaCenterCollectInterval is the time interval that quotaCenter
# collects metrics from Proxies, Query cluster and Data cluster.
# seconds, (0 ~ 65536)
quotaCenterCollectInterval: 3
ddl:
enabled: false
collectionRate: -1 # qps, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
partitionRate: -1 # qps, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
db:
collectionRate: -1 # qps of db level, default no limit, rate for CreateCollection, DropCollection, LoadCollection, ReleaseCollection
partitionRate: -1 # qps of db level, default no limit, rate for CreatePartition, DropPartition, LoadPartition, ReleasePartition
indexRate:
enabled: false
max: -1 # qps, default no limit, rate for CreateIndex, DropIndex
db:
max: -1 # qps of db level, default no limit, rate for CreateIndex, DropIndex
flushRate:
enabled: true
max: -1 # qps, default no limit, rate for flush
collection:
max: 0.1 # qps, rate for flush at collection level; 0.1 by default
db:
max: -1 # qps of db level, default no limit, rate for flush
compactionRate:
enabled: false
max: -1 # qps, default no limit, rate for manualCompaction
db:
max: -1 # qps of db level, default no limit, rate for manualCompaction
dml:
# dml limit rates, default no limit.
# The maximum rate will not be greater than max.
enabled: false
insertRate:
max: -1 # MB/s, default no limit
db:
max: -1 # MB/s, default no limit
collection:
max: -1 # MB/s, default no limit
partition:
max: -1 # MB/s, default no limit
upsertRate:
max: -1 # MB/s, default no limit
db:
max: -1 # MB/s, default no limit
collection:
max: -1 # MB/s, default no limit
partition:
max: -1 # MB/s, default no limit
deleteRate:
max: -1 # MB/s, default no limit
db:
max: -1 # MB/s, default no limit
collection:
max: -1 # MB/s, default no limit
partition:
max: -1 # MB/s, default no limit
bulkLoadRate:
max: -1 # MB/s, default no limit, not supported yet. TODO: limit bulkLoad rate
db:
max: -1 # MB/s, default no limit, not supported yet. TODO: limit db bulkLoad rate
collection:
max: -1 # MB/s, default no limit, not supported yet. TODO: limit collection bulkLoad rate
partition:
max: -1 # MB/s, default no limit, not supported yet. TODO: limit partition bulkLoad rate
dql:
# dql limit rates, default no limit.
# The maximum rate will not be greater than max.
enabled: false
searchRate:
max: -1 # vps (vectors per second), default no limit
db:
max: -1 # vps (vectors per second), default no limit
collection:
max: -1 # vps (vectors per second), default no limit
partition:
max: -1 # vps (vectors per second), default no limit
queryRate:
max: -1 # qps, default no limit
db:
max: -1 # qps, default no limit
collection:
max: -1 # qps, default no limit
partition:
max: -1 # qps, default no limit
limits:
maxCollectionNum: 65536
maxCollectionNumPerDB: 65536
maxInsertSize: -1 # maximum size of a single insert request, in bytes, -1 means no limit
maxResourceGroupNumOfQueryNode: 1024 # maximum number of resource groups of query nodes
limitWriting:
# forceDeny false means dml requests are allowed (except under some
# specific conditions, such as node memory reaching the water mark), true means always reject all dml requests.
forceDeny: false
ttProtection:
enabled: false
# maxTimeTickDelay indicates the backpressure for DML Operations.
# DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay,
# if time tick delay is greater than maxTimeTickDelay, all DML requests would be rejected.
# seconds
maxTimeTickDelay: 300
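# Illustrative example: with maxTimeTickDelay: 300, a time tick delay of 150 s cuts the
# allowed DML rate roughly in half, and a delay of 300 s or more rejects all DML requests.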
memProtection:
# When memory usage > memoryHighWaterLevel, all dml requests would be rejected;
# When memoryLowWaterLevel < memory usage < memoryHighWaterLevel, reduce the dml rate;
# When memory usage < memoryLowWaterLevel, no action.
enabled: true
dataNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in DataNodes
dataNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in DataNodes
queryNodeMemoryLowWaterLevel: 0.85 # (0, 1], memoryLowWaterLevel in QueryNodes
queryNodeMemoryHighWaterLevel: 0.95 # (0, 1], memoryHighWaterLevel in QueryNodes
growingSegmentsSizeProtection:
# No action will be taken if the growing segments size is less than the low watermark.
# When the growing segments size exceeds the low watermark, the dml rate will be reduced,
# but the rate will not be lower than minRateRatio * dmlRate.
enabled: false
minRateRatio: 0.5
lowWaterLevel: 0.2
highWaterLevel: 0.4
diskProtection:
enabled: true # When the total file size of object storage is greater than `diskQuota`, all dml requests would be rejected;
diskQuota: -1 # MB, (0, +inf), default no limit
diskQuotaPerDB: -1 # MB, (0, +inf), default no limit
diskQuotaPerCollection: -1 # MB, (0, +inf), default no limit
diskQuotaPerPartition: -1 # MB, (0, +inf), default no limit
limitReading:
# forceDeny false means dql requests are allowed (except for some
# specific conditions, such as collection has been dropped), true means always reject all dql requests.
forceDeny: false
queueProtection:
enabled: false
# nqInQueueThreshold indicates that the system is under backpressure for the Search/Query path.
# If NQ in any QueryNode's queue is greater than nqInQueueThreshold, search&query rates would gradually cool off
# until the NQ in queue no longer exceeds nqInQueueThreshold. The NQ of a query request is counted as 1.
# int, default no limit
nqInQueueThreshold: -1
# queueLatencyThreshold indicates that the system is under backpressure for the Search/Query path.
# If dql latency of queuing is greater than queueLatencyThreshold, search&query rates would gradually cool off
# until the latency of queuing no longer exceeds queueLatencyThreshold.
# The latency here refers to the averaged latency over a period of time.
# milliseconds, default no limit
queueLatencyThreshold: -1
resultProtection:
enabled: false
# maxReadResultRate indicates that the system is under backpressure for the Search/Query path.
# If dql result rate is greater than maxReadResultRate, search&query rates would gradually cool off
# until the read result rate no longer exceeds maxReadResultRate.
# MB/s, default no limit
maxReadResultRate: -1
maxReadResultRatePerDB: -1
maxReadResultRatePerCollection: -1
# coolOffSpeed is the speed at which search&query rates cool off.
# (0, 1]
coolOffSpeed: 0.9
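# Illustrative example (assuming each cool-off step scales the current rate): with
# coolOffSpeed: 0.9, every step reduces the allowed search/query rate to 90% of its
# current value until the protected metric falls back below its threshold.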
trace:
# trace exporter type, default is stdout,
# optional values: ['noop','stdout', 'jaeger', 'otlp']
exporter: noop
# fraction of traceID based sampler,
# optional values: [0, 1]
# Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
sampleFraction: 0
jaeger:
url: # when the exporter is jaeger, set the Jaeger URL here
otlp:
endpoint: # example: "127.0.0.1:4318"
secure: true
#when using GPU indexing, Milvus will utilize a memory pool to avoid frequent memory allocation and deallocation.
#here, you can set the size of the memory occupied by the memory pool, with the unit being MB.
#note that there is a possibility of Milvus crashing when the actual memory demand exceeds the value set by maxMemSize.
#if initMemSize and maxMemSize are both set to zero,
#milvus will automatically initialize half of the available GPU memory,
#and maxMemSize will be set to the whole available GPU memory.
gpu:
initMemSize: # GPU memory pool initial size, in MB
maxMemSize: # GPU memory pool max size, in MB
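#Illustrative example (placeholder values): initMemSize: 2048 and maxMemSize: 4096 would
#reserve a 2 GB pool that can grow to 4 GB; leaving both at zero lets Milvus size the
#pool from the available GPU memory as described above.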

View File

@@ -14,3 +14,7 @@ export INDEX_NAME="rag-redis"
# Set it as a non-null string, such as true, if you want to enable logging facility,
# otherwise, keep it as "" to disable it.
export LOGFLAG=""
# Set OpenTelemetry Tracing Endpoint
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces

View File

@@ -10,7 +10,7 @@ Quick Start:
2. Run Docker Compose.
3. Consume the ChatQnA Service.
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.
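For example, a minimal sketch of the overrides for the 70B distill model, using the `LLM_MODEL_ID` and `NUM_CARDS` variables referenced above (values follow the note; adjust to your setup):
```bash
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
export NUM_CARDS=8
```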
## Quick Start: 1.Setup Environment Variable
@@ -39,12 +39,37 @@ To set up environment variables for deploying ChatQnA services, follow these ste
source ./set_env.sh
```
4. Change Model for LLM serving
By default, Meta-Llama-3-8B-Instruct is used for LLM serving; the default model can be changed to another validated LLM model.
Please pick a [validated LLM model](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#validated-llm-models) from the table.
To change the default model defined in set_env.sh, overwrite it by exporting LLM_MODEL_ID with the new model or by modifying set_env.sh, and then repeat step 3.
For example, change to DeepSeek-R1-Distill-Qwen-32B using the following command.
```bash
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
```
Please also check [required gaudi cards for different models](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/src/text-generation#system-requirements-for-llm-models) for new models.
It might be necessary to increase the number of Gaudi cards for the model by exporting NUM_CARDS with the required value or by modifying set_env.sh, and then repeating step 3. For example, increase the number of Gaudi cards for DeepSeek-R1-Distill-Qwen-32B using the following command:
```bash
export NUM_CARDS=4
```
## Quick Start: 2.Run Docker Compose
```bash
docker compose up -d
```
To enable Open Telemetry Tracing, the compose.telemetry.yaml file needs to be merged with the default compose.yaml file.
```bash
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
It will automatically download the Docker images from Docker Hub:
```bash
@@ -259,12 +284,16 @@ If use vLLM as the LLM serving backend.
docker compose -f compose.yaml up -d
# Start ChatQnA without Rerank Pipeline
docker compose -f compose_without_rerank.yaml up -d
# Start ChatQnA with Rerank Pipeline and Open Telemetry Tracing
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```
If using TGI as the LLM serving backend.
```bash
docker compose -f compose_tgi.yaml up -d
# Start ChatQnA with Open Telemetry Tracing
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d
```
If you want to enable the guardrails microservice in the pipeline, please use the command below instead:

View File

@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tei-embedding-service:
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
tei-reranking-service:
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
jaeger:
image: jaegertracing/all-in-one:latest
container_name: jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
- "9411:9411"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
chatqna-gaudi-backend-server:
environment:
- ENABLE_OPEA_TELEMETRY=true
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -62,7 +62,7 @@ services:
ports:
- "8808:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -83,7 +83,7 @@ services:
ports:
- "8007:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -92,6 +92,7 @@ services:
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
@@ -102,7 +103,7 @@ services:
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
chatqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
container_name: chatqna-gaudi-backend-server

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8088:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -70,7 +70,7 @@ services:
ports:
- "8090:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -103,7 +103,7 @@ services:
ports:
- "8808:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -124,7 +124,7 @@ services:
ports:
- "8008:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -133,12 +133,13 @@ services:
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
chatqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
container_name: chatqna-gaudi-guardrails-server

View File

@@ -0,0 +1,29 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tei-embedding-service:
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
tei-reranking-service:
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
tgi-service:
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
jaeger:
image: jaegertracing/all-in-one:latest
container_name: jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
- "9411:9411"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
chatqna-gaudi-backend-server:
environment:
- ENABLE_OPEA_TELEMETRY=true
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}

View File

@@ -25,20 +25,19 @@ services:
INDEX_NAME: ${INDEX_NAME}
TEI_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT}
tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: tei-embedding-gaudi-server
ports:
- "8090:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
retriever:
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
container_name: retriever-redis-server
@@ -56,7 +55,6 @@ services:
INDEX_NAME: ${INDEX_NAME}
TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
TELEMETRY_ENDPOINT: ${TELEMETRY_ENDPOINT}
LOGFLAG: ${LOGFLAG}
RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_REDIS"
restart: unless-stopped
@@ -66,7 +64,7 @@ services:
ports:
- "8808:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
runtime: habana
cap_add:
- SYS_NICE
@@ -80,14 +78,14 @@ services:
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
MAX_WARMUP_SEQUENCE_LENGTH: 512
command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-gaudi-server
ports:
- "8005:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -101,26 +99,12 @@ services:
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
NUM_CARDS: ${NUM_CARDS}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
jaeger:
image: jaegertracing/all-in-one:latest
container_name: jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
- "9411:9411"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
COLLECTOR_ZIPKIN_HOST_PORT: 9411
restart: unless-stopped
command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096
chatqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
container_name: chatqna-gaudi-backend-server
@@ -146,7 +130,6 @@ services:
- LLM_SERVER_PORT=${LLM_SERVER_PORT:-80}
- LLM_MODEL=${LLM_MODEL_ID}
- LOGFLAG=${LOGFLAG}
- TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
ipc: host
restart: always
chatqna-gaudi-ui-server:

View File

@@ -31,7 +31,7 @@ services:
ports:
- "8090:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
@@ -64,7 +64,7 @@ services:
ports:
- "8007:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
@@ -73,12 +73,13 @@ services:
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
chatqna-gaudi-backend-server:
image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
container_name: chatqna-gaudi-backend-server

View File

@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export INDEX_NAME="rag-redis"
export NUM_CARDS=1
# Set it as a non-null string, such as true, if you want to enable logging facility,
# otherwise, keep it as "" to disable it.
export LOGFLAG=""

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,6 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-guardrails chatqna-ui dataprep retriever vllm-gaudi guardrails nginx"
@@ -47,6 +47,7 @@ function start_services() {
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"

View File

@@ -0,0 +1,249 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -e
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
export host_ip=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
# make sure NOT to change the pwd
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export no_proxy=${no_proxy},${ip_address}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export LOGFLAG=true
# Start Docker Containers
docker compose -f compose_milvus.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
}
function validate_service() {
local URL="$1"
local EXPECTED_RESULT="$2"
local SERVICE_NAME="$3"
local DOCKER_NAME="$4"
local INPUT_DATA="$5"
if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
cd $LOG_PATH
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
else
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
fi
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
# check response status
if [ "$HTTP_STATUS" -ne "200" ]; then
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
exit 1
else
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
fi
echo "Response"
echo $RESPONSE_BODY
echo "Expected Result"
echo $EXPECTED_RESULT
# check response body
if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
exit 1
else
echo "[ $SERVICE_NAME ] Content is as expected."
fi
sleep 1s
}
function validate_microservices() {
# Check if the microservices are running correctly.
# tei for embedding service
validate_service \
"${ip_address}:6006/embed" \
"[[" \
"tei-embedding" \
"tei-embedding-server" \
'{"inputs":"What is Deep Learning?"}'
sleep 1m # retrieval can't curl as expected, try to wait for more time
# test /v1/dataprep/ingest upload file
echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
validate_service \
"http://${ip_address}:11101/v1/dataprep/ingest" \
"Data preparation succeeded" \
"dataprep_upload_file" \
"dataprep-milvus-server"
# test /v1/dataprep/delete
validate_service \
"http://${ip_address}:11101/v1/dataprep/delete" \
'{"status":true}' \
"dataprep_del" \
"dataprep-milvus-server"
# retrieval microservice
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
validate_service \
"${ip_address}:7000/v1/retrieval" \
" " \
"retrieval" \
"retriever-milvus-server" \
"{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}"
# tei for rerank microservice
echo "Validating reranking service"
validate_service \
"${ip_address}:8808/rerank" \
'{"index":1,"score":' \
"tei-rerank" \
"tei-reranking-server" \
'{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}'
# vllm for llm service
echo "Validating llm service"
validate_service \
"${ip_address}:9009/v1/chat/completions" \
"content" \
"vllm-llm" \
"vllm-service" \
'{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
"data: " \
"chatqna-megaservice" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
}
function validate_frontend() {
echo "[ TEST INFO ]: --------- frontend test started ---------"
cd $WORKPATH/ui/svelte
local conda_env_name="OPEA_e2e"
export PATH=${HOME}/miniforge3/bin/:$PATH
if conda info --envs | grep -q "$conda_env_name"; then
echo "$conda_env_name exists!"
else
conda create -n ${conda_env_name} python=3.12 -y
fi
source activate ${conda_env_name}
echo "[ TEST INFO ]: --------- conda env activated ---------"
sed -i "s/localhost/$ip_address/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps
node -v && npm -v && pip list
exit_status=0
npx playwright test || exit_status=$?
if [ $exit_status -ne 0 ]; then
echo "[TEST INFO]: ---------frontend test failed---------"
exit $exit_status
else
echo "[TEST INFO]: ---------frontend test passed---------"
fi
}
function stop_docker() {
echo "In stop docker"
echo $WORKPATH
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose -f compose_milvus.yaml down
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_time=$(date +%s)
start_services
end_time=$(date +%s)
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s" && sleep 1s
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
stop_docker
echo y | docker system prune
}
main

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,6 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep retriever vllm-gaudi nginx"
@@ -45,12 +45,16 @@ function start_services() {
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export host_ip=${ip_address}
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
# Start Docker Containers
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
docker compose -f compose.yaml -f compose.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 160 ]]; do
echo "n=$n"
@@ -134,7 +138,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
"data:" \
"Nike" \
"mega-chatqna" \
"chatqna-gaudi-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
@@ -171,7 +175,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose -f compose.yaml down
docker compose -f compose.yaml -f compose.telemetry.yaml down
}
function main() {
@@ -186,7 +190,7 @@ function main() {
validate_microservices
validate_megaservice
# validate_frontend
validate_frontend
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"/var/opea/chatqna-service/data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -45,7 +46,7 @@ export CHATQNA_RERANK_SERVICE_HOST_IP=${HOST_IP}
export CHATQNA_LLM_SERVICE_HOST_IP=${HOST_IP}
export CHATQNA_NGINX_PORT=80
export CHATQNA_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export PATH="/home/huggingface/miniconda3/bin:$PATH"
export PATH="$HOME/miniconda3/bin:$PATH"
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
@@ -207,7 +208,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
"data: " \
"Nike" \
"chatqna-megaservice" \
"chatqna-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
@@ -259,17 +260,12 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s" && sleep 1s
if [ "${mode}" == "perf" ]; then
python3 "$WORKPATH"/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
validate_frontend
echo "==== frontend validated ===="
fi
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
validate_frontend
echo "==== frontend validated ===="
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 https://github.com/vllm-project/vllm.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
# make sure NOT to change the pwd
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
@@ -49,9 +55,12 @@ function start_services() {
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export host_ip=${ip_address}
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
# Start Docker Containers
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
docker compose -f compose.yaml -f compose.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
@@ -93,6 +102,7 @@ function validate_service() {
function validate_microservices() {
# Check if the microservices are running correctly.
sleep 3m
# tei for embedding service
validate_service \
@@ -102,8 +112,6 @@ function validate_microservices() {
"tei-embedding-server" \
'{"inputs":"What is Deep Learning?"}'
sleep 1m # retrieval can't curl as expected, try to wait for more time
# retrieval microservice
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
validate_service \
@@ -134,7 +142,7 @@ function validate_megaservice() {
# Curl the Mega Service
validate_service \
"${ip_address}:8888/v1/chatqna" \
"data" \
"Nike" \
"mega-chatqna" \
"chatqna-xeon-backend-server" \
'{"messages": "What is the revenue of Nike in 2023?"}'
@@ -172,7 +180,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon
docker compose -f compose.yaml down
docker compose -f compose.yaml -f compose.telemetry.yaml down
}
function main() {
@@ -185,13 +193,9 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s" && sleep 1s
if [ "${mode}" == "perf" ]; then
python3 $WORKPATH/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
validate_megaservice
# validate_frontend
fi
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 https://github.com/vllm-project/vllm.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
# Do not change the pwd
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"
@@ -228,14 +234,10 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s" && sleep 1s
if [ "${mode}" == "perf" ]; then
python3 $WORKPATH/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
fi
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 https://github.com/vllm-project/vllm.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
# Do not change the pwd
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna chatqna-ui dataprep retriever vllm nginx"

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -46,6 +47,7 @@ function start_services() {
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export RERANK_MODEL_ID="BAAI/bge-reranker-base"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
@@ -53,7 +55,7 @@ function start_services() {
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 500 ]]; do
@@ -217,7 +219,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose -f compose_tgi.yaml down
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml down
}
function main() {
@@ -230,13 +232,9 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s"
if [ "${mode}" == "perf" ]; then
python3 $WORKPATH/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
validate_megaservice
validate_frontend
fi
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -48,9 +49,12 @@ function start_services() {
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
@@ -216,7 +220,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon
docker compose -f compose_tgi.yaml down
docker compose -f compose_tgi.yaml -f compose_tgi.telemetry.yaml down
}
function main() {
@@ -229,16 +233,12 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s" && sleep 1s
if [ "${mode}" == "perf" ]; then
python3 $WORKPATH/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
validate_frontend
echo "==== frontend validated ===="
fi
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
validate_frontend
echo "==== frontend validated ===="
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,6 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
sed -i 's/triton/triton==3.1.0/g' vllm-fork/requirements-hpu.txt
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-without-rerank chatqna-ui dataprep retriever vllm-gaudi nginx"
@@ -45,6 +45,7 @@ function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
export INDEX_NAME="rag-redis"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
@@ -218,13 +219,9 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s"
if [ "${mode}" == "perf" ]; then
python3 $WORKPATH/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
validate_megaservice
validate_frontend
fi
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,7 +30,13 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 https://github.com/vllm-project/vllm.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
# Get the latest tag
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
# Return to the previous directory so the overall working directory is unchanged
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="chatqna-without-rerank chatqna-ui dataprep retriever vllm nginx"
@@ -219,16 +226,12 @@ function main() {
duration=$((end_time-start_time))
echo "Mega service start duration is $duration s" && sleep 1s
if [ "${mode}" == "perf" ]; then
python3 $WORKPATH/tests/chatqna_benchmark.py
elif [ "${mode}" == "" ]; then
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
validate_frontend
echo "==== frontend validated ===="
fi
validate_microservices
echo "==== microservices validated ===="
validate_megaservice
echo "==== megaservice validated ===="
validate_frontend
echo "==== frontend validated ===="
stop_docker
echo y | docker system prune

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}

View File

@@ -8,7 +8,7 @@ services:
ports:
- "8028:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"

View File

@@ -2,6 +2,8 @@
This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using the `llm` microservice. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.
## 🚀 Create an AWS Xeon Instance
To run the example on an AWS Xeon instance, start by creating an AWS account if you don't already have one, then get started with the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home). AWS EC2 M7i, C7i, C7i-flex, and M7i-flex instances are built on Intel Xeon Scalable processors (code named Sapphire Rapids) and are suitable for this task.
@@ -63,6 +65,37 @@ By default, the LLM model is set to a default value as listed below:
Change `LLM_MODEL_ID` below to suit your needs.
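For example, a minimal override before bringing the stack up might look like the following; the alternative model is only an illustration, and any instruction-tuned model supported by vLLM/TGI should work:
```bash
# Default model used by this example
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
# Or point to another instruction-tuned model of your choice, e.g.:
# export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
```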
Users in China who are unable to download models directly from Hugging Face can use [ModelScope](https://www.modelscope.cn/models) or a Hugging Face mirror to download models instead. vLLM/TGI can load the models either online or offline, as described below:
1. Online
```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
2. Offline
- Search for your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for the model `mistralai/Mistral-7B-Instruct-v0.3`.
- Click the `Download this model` button and choose a download method to save the model to your local path `/path/to/model`.
- Run the following command to start the LLM service.
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
### Setup Environment Variables
1. Set the required environment variables:
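The full variable list is defined in the example's `set_env.sh`; as a rough sketch, mirroring the Xeon test script updated in this change set (with `host_ip` standing in for your host's external IP), the key variables look like:
```bash
# Sketch of the required environment; adjust values to your deployment.
export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"
```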
@@ -95,15 +128,47 @@ Change the `LLM_MODEL_ID` below for your needs.
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/cpu/xeon
docker compose up -d
```
If using vLLM as the LLM serving backend:
```bash
docker compose -f compose.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
```
### Validate Microservices
1. TGI Service
1. LLM backend Service
On first startup, this service will take extra time to download, load, and warm up the model. Once that finishes, the service will be ready.
Run the command below to check whether the LLM serving backend is ready.
```bash
curl http://${host_ip}:8008/generate \
# vLLM service
docker logs codetrans-xeon-vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs codetrans-xeon-tgi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
Then run the `curl` command below to validate the services.
```bash
# either vLLM or TGI service
curl http://${host_ip}:8008/v1/chat/completions \
-X POST \
-d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'

View File

@@ -2,31 +2,32 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: codetrans-tgi-service
vllm-service:
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
container_name: codetrans-xeon-vllm-service
ports:
- "8008:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: llm-textgen-server
container_name: codetrans-xeon-llm-server
depends_on:
tgi-service:
vllm-service:
condition: service_healthy
ports:
- "9000:9000"
@@ -35,18 +36,19 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-xeon-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-xeon-backend-server
depends_on:
- tgi-service
- vllm-service
- llm
ports:
- "7777:7777"
- "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -61,7 +63,7 @@ services:
depends_on:
- codetrans-xeon-backend-server
ports:
- "5173:5173"
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}

View File

@@ -0,0 +1,95 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: codetrans-xeon-tgi-service
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE}:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
healthcheck:
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: codetrans-xeon-llm-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-xeon-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-xeon-backend-server
depends_on:
- tgi-service
- llm
ports:
- "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
ipc: host
restart: always
codetrans-xeon-ui-server:
image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
container_name: codetrans-xeon-ui-server
depends_on:
- codetrans-xeon-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
codetrans-xeon-nginx-server:
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
container_name: codetrans-xeon-nginx-server
depends_on:
- codetrans-xeon-backend-server
- codetrans-xeon-ui-server
ports:
- "${NGINX_PORT:-80}:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
- FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
- BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
- BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
- BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -2,6 +2,8 @@
This document outlines the deployment process for a CodeTrans application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution using the `llm` microservice. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
The default pipeline deploys with vLLM as the LLM serving component. It also provides the option of using a TGI backend for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.
## 🚀 Build Docker Images
First of all, you need to build the Docker images locally and install the required Python packages. This step can be skipped once the Docker images are published to Docker Hub.
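As a sketch of what that build looks like, mirroring the Gaudi test script included in this change set (the HabanaAI vLLM fork is only needed when building the `vllm-gaudi` image):
```bash
# Clone the build dependencies next to build.yaml, then build the images.
cd GenAIExamples/CodeTrans/docker_image_build
git clone --depth 1 https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
service_list="codetrans codetrans-ui llm-textgen vllm-gaudi nginx"
docker compose -f build.yaml build ${service_list} --no-cache
```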
@@ -55,6 +57,37 @@ By default, the LLM model is set to a default value as listed below:
Change `LLM_MODEL_ID` below to suit your needs.
Users in China who are unable to download models directly from Hugging Face can use [ModelScope](https://www.modelscope.cn/models) or a Hugging Face mirror to download models instead. vLLM/TGI can load the models either online or offline, as described below:
1. Online
```bash
export HF_TOKEN=${your_hf_token}
export HF_ENDPOINT="https://hf-mirror.com"
model_name="mistralai/Mistral-7B-Instruct-v0.3"
# Start vLLM LLM Service
docker run -p 8008:80 -v ./data:/data --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
```
2. Offline
- Search for your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for the model `mistralai/Mistral-7B-Instruct-v0.3`.
- Click the `Download this model` button and choose a download method to save the model to your local path `/path/to/model`.
- Run the following command to start the LLM service.
```bash
export HF_TOKEN=${your_hf_token}
export model_path="/path/to/model"
# Start vLLM LLM Service
docker run -p 8008:80 -v $model_path:/data --name vllm-service --shm-size 128g opea/vllm:latest --model /data --host 0.0.0.0 --port 80
# Start TGI LLM Service
docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
```
### Setup Environment Variables
1. Set the required environment variables:
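The Gaudi deployment additionally exposes the vLLM tuning knobs consumed by its compose file; a rough sketch, following the `set_env.sh` shown later in this change set:
```bash
# Sketch of the required environment for the Gaudi deployment.
export host_ip=$(hostname -I | awk '{print $1}')
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export NUM_CARDS=1                 # Gaudi cards used for tensor parallelism
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"
```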
@@ -87,12 +120,43 @@ Change the `LLM_MODEL_ID` below for your needs.
```bash
cd GenAIExamples/CodeTrans/docker_compose/intel/hpu/gaudi
docker compose up -d
```
If using vLLM as the LLM serving backend:
```bash
docker compose -f compose.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
```
### Validate Microservices
1. TGI Service
1. LLM backend Service
On first startup, this service will take extra time to download, load, and warm up the model. Once that finishes, the service will be ready.
Run the command below to check whether the LLM serving backend is ready.
```bash
# vLLM service
docker logs codetrans-gaudi-vllm-service 2>&1 | grep complete
# If the service is ready, you will get the response like below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs codetrans-gaudi-tgi-service | grep Connected
# If the service is ready, you will get the response like below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
Then run the `curl` command below to validate the services.
```bash
curl http://${host_ip}:8008/generate \

View File

@@ -2,39 +2,38 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: codetrans-tgi-service
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: codetrans-gaudi-vllm-service
ports:
- "8008:80"
volumes:
- "./data:/data"
- "${MODEL_CACHE:-./data}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "sleep 500 && exit 0"]
interval: 1s
timeout: 505s
retries: 1
test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: llm-textgen-gaudi-server
container_name: codetrans-xeon-llm-server
depends_on:
tgi-service:
vllm-service:
condition: service_healthy
ports:
- "9000:9000"
@@ -43,18 +42,19 @@ services:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-gaudi-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-gaudi-backend-server
depends_on:
- tgi-service
- vllm-service
- llm
ports:
- "7777:7777"
- "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -69,7 +69,7 @@ services:
depends_on:
- codetrans-gaudi-backend-server
ports:
- "5173:5173"
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}

View File

@@ -0,0 +1,99 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: codetrans-gaudi-tgi-service
ports:
- "8008:80"
volumes:
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096
llm:
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
container_name: codetrans-gaudi-llm-server
depends_on:
- tgi-service
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
codetrans-gaudi-backend-server:
image: ${REGISTRY:-opea}/codetrans:${TAG:-latest}
container_name: codetrans-gaudi-backend-server
depends_on:
- tgi-service
- llm
ports:
- "${BACKEND_SERVICE_PORT:-7777}:7777"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
ipc: host
restart: always
codetrans-gaudi-ui-server:
image: ${REGISTRY:-opea}/codetrans-ui:${TAG:-latest}
container_name: codetrans-gaudi-ui-server
depends_on:
- codetrans-gaudi-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
codetrans-gaudi-nginx-server:
image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
container_name: codetrans-gaudi-nginx-server
depends_on:
- codetrans-gaudi-backend-server
- codetrans-gaudi-ui-server
ports:
- "${NGINX_PORT:-80}:80"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP}
- FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT}
- BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME}
- BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP}
- BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT}
ipc: host
restart: always
networks:
default:
driver: bridge

View File

@@ -8,7 +8,12 @@ popd > /dev/null
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_ENDPOINT="http://${host_ip}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:7777/v1/codetrans"

View File

@@ -23,6 +23,18 @@ services:
dockerfile: comps/llms/src/text-generation/Dockerfile
extends: codetrans
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
vllm:
build:
context: vllm
dockerfile: Dockerfile.cpu
extends: codetrans
image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
vllm-gaudi:
build:
context: vllm-fork
dockerfile: Dockerfile.hpu
extends: codetrans
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
nginx:
build:
context: GenAIComps

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,12 +30,12 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone --depth 1 --branch v0.6.4.post2+Gaudi-1.19.0 https://github.com/HabanaAI/vllm-fork.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen nginx"
service_list="codetrans codetrans-ui llm-textgen vllm-gaudi nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
@@ -44,7 +45,12 @@ function start_services() {
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export NUM_CARDS=1
export BLOCK_SIZE=128
export MAX_NUM_SEQS=256
export MAX_SEQ_LEN_TO_CAPTURE=2048
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -64,13 +70,15 @@ function start_services() {
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
docker logs codetrans-gaudi-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
@@ -102,27 +110,19 @@ function validate_services() {
}
function validate_microservices() {
# tgi for embedding service
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"llm-textgen-gaudi-server" \
"codetrans-xeon-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:7777/v1/codetrans" \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-gaudi-backend-server" \
@@ -130,7 +130,7 @@ function validate_megaservice() {
# test the megaservice via nginx
validate_services \
"${ip_address}:80/v1/codetrans" \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-gaudi-nginx-server" \
@@ -169,7 +169,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
docker compose stop && docker compose rm -f
docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {

View File

@@ -9,6 +9,7 @@ echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
@@ -29,12 +30,16 @@ function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
git clone https://github.com/vllm-project/vllm.git && cd vllm
VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
echo "Check out vLLM tag ${VLLM_VER}"
git checkout ${VLLM_VER} &> /dev/null
cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen nginx"
service_list="codetrans codetrans-ui llm-textgen vllm nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
@@ -43,7 +48,8 @@ function start_services() {
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
@@ -59,17 +65,19 @@ function start_services() {
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
docker logs codetrans-xeon-vllm-service > ${LOG_PATH}/vllm_service_start.log 2>&1
if grep -q complete ${LOG_PATH}/vllm_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
@@ -101,20 +109,12 @@ function validate_services() {
}
function validate_microservices() {
# tgi for embedding service
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"llm-textgen-server" \
"codetrans-xeon-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
@@ -122,7 +122,7 @@ function validate_microservices() {
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:7777/v1/codetrans" \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-xeon-backend-server" \
@@ -130,7 +130,7 @@ function validate_megaservice() {
# test the megaservice via nginx
validate_services \
"${ip_address}:80/v1/codetrans" \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-xeon-nginx-server" \
@@ -168,7 +168,7 @@ function validate_frontend() {
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose stop && docker compose rm -f
docker compose -f compose.yaml stop && docker compose rm -f
}
function main() {

View File

@@ -0,0 +1,194 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi/
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
export FRONTEND_SERVICE_IP=${ip_address}
export FRONTEND_SERVICE_PORT=5173
export BACKEND_SERVICE_NAME=codetrans
export BACKEND_SERVICE_IP=${ip_address}
export BACKEND_SERVICE_PORT=7777
export NGINX_PORT=80
export host_ip=${ip_address}
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-gaudi-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
local URL="$1"
local EXPECTED_RESULT="$2"
local SERVICE_NAME="$3"
local DOCKER_NAME="$4"
local INPUT_DATA="$5"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
else
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
else
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
sleep 5s
}
function validate_microservices() {
# tgi service (LLM serving backend)
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-gaudi-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"codetrans-gaudi-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-gaudi-backend-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
# test the megaservice via nginx
validate_services \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-gaudi-nginx-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
}
function validate_frontend() {
cd $WORKPATH/ui/svelte
local conda_env_name="OPEA_e2e"
export PATH=${HOME}/miniforge3/bin/:$PATH
if conda info --envs | grep -q "$conda_env_name"; then
echo "$conda_env_name exist!"
else
conda create -n ${conda_env_name} python=3.12 -y
fi
source activate ${conda_env_name}
sed -i "s/localhost/$ip_address/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps
node -v && npm -v && pip list
exit_status=0
npx playwright test || exit_status=$?
if [ $exit_status -ne 0 ]; then
echo "[TEST INFO]: ---------frontend test failed---------"
exit $exit_status
else
echo "[TEST INFO]: ---------frontend test passed---------"
fi
}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi/
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune
}
main

View File

@@ -0,0 +1,194 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
export MODEL_CACHE=${model_cache:-"./data"}
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
opea_branch=${opea_branch:-"main"}
# If the opea_branch isn't main, replace the git clone branch in Dockerfile.
if [[ "${opea_branch}" != "main" ]]; then
cd $WORKPATH
OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
find . -type f -name "Dockerfile*" | while read -r file; do
echo "Processing file: $file"
sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
done
fi
cd $WORKPATH/docker_image_build
git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
service_list="codetrans codetrans-ui llm-textgen nginx"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
docker images && sleep 1s
}
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
export http_proxy=${http_proxy}
export https_proxy=${http_proxy}
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
export LLM_ENDPOINT="http://${ip_address}:8008"
export LLM_COMPONENT_NAME="OpeaTextGenService"
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export MEGA_SERVICE_HOST_IP=${ip_address}
export LLM_SERVICE_HOST_IP=${ip_address}
export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:7777/v1/codetrans"
export FRONTEND_SERVICE_IP=${ip_address}
export FRONTEND_SERVICE_PORT=5173
export BACKEND_SERVICE_NAME=codetrans
export BACKEND_SERVICE_IP=${ip_address}
export BACKEND_SERVICE_PORT=7777
export NGINX_PORT=80
export host_ip=${ip_address}
sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
# Start Docker Containers
docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
n=0
until [[ "$n" -ge 100 ]]; do
docker logs codetrans-xeon-tgi-service > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
break
fi
sleep 5s
n=$((n+1))
done
sleep 1m
}
function validate_services() {
local URL="$1"
local EXPECTED_RESULT="$2"
local SERVICE_NAME="$3"
local DOCKER_NAME="$4"
local INPUT_DATA="$5"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
else
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
else
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
fi
sleep 5s
}
function validate_microservices() {
# tgi service (LLM serving backend)
validate_services \
"${ip_address}:8008/generate" \
"generated_text" \
"tgi" \
"codetrans-xeon-tgi-service" \
'{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
# llm microservice
validate_services \
"${ip_address}:9000/v1/chat/completions" \
"data: " \
"llm" \
"codetrans-xeon-llm-server" \
'{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}'
}
function validate_megaservice() {
# Curl the Mega Service
validate_services \
"${ip_address}:${BACKEND_SERVICE_PORT}/v1/codetrans" \
"print" \
"mega-codetrans" \
"codetrans-xeon-backend-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
# test the megaservice via nginx
validate_services \
"${ip_address}:${NGINX_PORT}/v1/codetrans" \
"print" \
"mega-codetrans-nginx" \
"codetrans-xeon-nginx-server" \
'{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}'
}
function validate_frontend() {
cd $WORKPATH/ui/svelte
local conda_env_name="OPEA_e2e"
export PATH=${HOME}/miniforge3/bin/:$PATH
if conda info --envs | grep -q "$conda_env_name"; then
echo "$conda_env_name exist!"
else
conda create -n ${conda_env_name} python=3.12 -y
fi
source activate ${conda_env_name}
sed -i "s/localhost/$ip_address/g" playwright.config.ts
conda install -c conda-forge nodejs=22.6.0 -y
npm install && npm ci && npx playwright install --with-deps
node -v && npm -v && pip list
exit_status=0
npx playwright test || exit_status=$?
if [ $exit_status -ne 0 ]; then
echo "[TEST INFO]: ---------frontend test failed---------"
exit $exit_status
else
echo "[TEST INFO]: ---------frontend test passed---------"
fi
}
function stop_docker() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
docker compose -f compose_tgi.yaml stop && docker compose rm -f
}
function main() {
stop_docker
if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
start_services
validate_microservices
validate_megaservice
validate_frontend
stop_docker
echo y | docker system prune
}
main

Some files were not shown because too many files have changed in this diff.