Sync value files from GenAIInfra (#1428)

All Gaudi values files were updated with extra flags.
Added Helm support for two new examples, Text2Image and SearchQnA, and a minor fix for llm-uservice.

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
Authored by dolpher on 2025-01-22 17:44:11 +08:00, committed by GitHub
parent 5c36443b11
commit ee0e5cc8d9
34 changed files with 343 additions and 1487 deletions

View File

@@ -4,35 +4,13 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
tgi:
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
resources:
limits:
habana.ai/gaudi: 4
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
extraCmdArgs: ["--sharded","true","--num-shard","4"]
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
repository: opea/vllm-gaudi
supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm

View File

@@ -5,7 +5,7 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
failureThreshold: 120
whisper:
image:
repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1
speecht5:
image:
repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1

View File

@@ -0,0 +1,112 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.
vllm:
enabled: false
tgi:
enabled: true
# TODO: add Helm value also for TGI data type option:
# https://github.com/opea-project/GenAIExamples/issues/330
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
# Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
#resources:
# limits:
# cpu: 8
# memory: 70Gi
# requests:
# cpu: 6
# memory: 65Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 16
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 10
periodSeconds: 5
failureThreshold: 180
timeoutSeconds: 2
teirerank:
RERANK_MODEL_ID: "BAAI/bge-reranker-base"
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
resources:
limits:
cpu: 4
memory: 30Gi
requests:
cpu: 2
memory: 25Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 8
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
tei:
EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
resources:
limits:
cpu: 4
memory: 4Gi
requests:
cpu: 2
memory: 3Gi
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 24
timeoutSeconds: 2
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 2
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
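
These CPU overrides are consumed like any other Helm values file. A minimal sketch of applying them, assuming this file belongs to the ChatQnA chart and is saved locally as `cpu-values.yaml` (the diff view omits filenames, so the chart reference, release name, and filename here are assumptions):

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  -f cpu-values.yaml
# To test a service without resource requests, set "resources: {}" for it
# in a local copy of this values file, as the comment block above suggests.
```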

View File

@@ -1,109 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Override CPU resource request and probe timing values in specific subcharts
#
# RESOURCES
#
# Resource request matching actual resource usage (with enough slack)
# is important when service is scaled up, so that right amount of pods
# get scheduled to right nodes.
#
# Because resource usage depends on the used devices, model, data type
# and SW versions, and this top-level chart has overrides for them,
# resource requests need to be specified here too.
#
# To test service without resource request, use "resources: {}".
#
# PROBES
#
# Inferencing pods startup / warmup takes *much* longer on CPUs than
# with acceleration devices, and their responses are also slower,
# especially when node is running several instances of these services.
#
# Kubernetes restarting pod before its startup finishes, or not
# sending it queries because it's not in ready state due to slow
# readiness responses, does really NOT help in getting faster responses.
#
# => probe timings need to be increased when running on CPU.
tgi:
# TODO: add Helm value also for TGI data type option:
# https://github.com/opea-project/GenAIExamples/issues/330
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
# Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
resources:
limits:
cpu: 8
memory: 70Gi
requests:
cpu: 6
memory: 65Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 16
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 10
periodSeconds: 5
failureThreshold: 180
timeoutSeconds: 2
teirerank:
RERANK_MODEL_ID: "BAAI/bge-reranker-base"
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
resources:
limits:
cpu: 4
memory: 30Gi
requests:
cpu: 2
memory: 25Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 8
periodSeconds: 8
timeoutSeconds: 4
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
tei:
EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
resources:
limits:
cpu: 4
memory: 4Gi
requests:
cpu: 2
memory: 3Gi
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 24
timeoutSeconds: 2
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 2
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 120
image:
repository: opea/chatqna

View File

@@ -4,12 +4,15 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
vllm:
enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1

View File

@@ -6,9 +6,9 @@
tgi:
enabled: false
vllm:
enabled: true
shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -39,7 +39,6 @@ vllm:
"--max-seq_len-to-capture", "2048"
]
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#

View File

@@ -44,44 +44,13 @@ teirerank:
readinessProbe:
timeoutSeconds: 1
tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
resources:
limits:
habana.ai/gaudi: 1
# higher limits are needed with extra input tokens added by rerank
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tgi-guardrails:
enabled: true
accelDevice: "gaudi"
LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
@@ -106,3 +75,38 @@ tgi-guardrails:
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tgi:
enabled: false
vllm:
enabled: true
shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
]

View File

@@ -1,14 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
image:
repository: opea/chatqna-guardrails
# guardrails related config
guardrails-usvc:
enabled: true
# SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
tgi-guardrails:
enabled: true
LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"

View File

@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# To override values in subchart tgi
tgi:
accelDevice: "nvidia"
image:
repository: ghcr.io/huggingface/text-generation-inference
tag: "2.2.0"
resources:
limits:
nvidia.com/gpu: 1
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120

View File

@@ -1,117 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=20
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [[ $status == false ]]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
function install_chatqna() {
echo "Testing manifests chatqna_guardrails"
local ns=$1
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
kubectl create namespace $ns
# install guardrails
kubectl apply -f chatqna-guardrails.yaml -n $ns
# Sleep enough time for chatqna_guardrails to be ready
sleep 60
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
NAMESPACE=$2
install_chatqna $NAMESPACE
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna-guardrails
validate_chatqna $NAMESPACE chatqna-guardrails
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,117 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=10
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
function install_chatqna() {
echo "Testing manifests chatqna_guardrails"
local ns=$1
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
kubectl create namespace $ns
# install guardrail
kubectl apply -f chatqna-guardrails.yaml -n $ns
# Sleep enough time for chatqna_guardrails to be ready
sleep 60
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
NAMESPACE=$2
install_chatqna $NAMESPACE
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna-guardrails
validate_chatqna $NAMESPACE chatqna-guardrails
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,113 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function install_chatqna {
echo "namespace is $NAMESPACE"
kubectl apply -f chatqna.yaml -n $NAMESPACE
# Sleep enough time for retreiver-usvc to be ready
sleep 60
}
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=20
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_chatqna
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna
validate_chatqna $NAMESPACE chatqna
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,112 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function install_chatqna {
echo "namespace is $NAMESPACE"
kubectl apply -f chatqna.yaml -n $NAMESPACE
# Sleep enough time for retreiver-usvc to be ready
sleep 60
}
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=10
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice tgi-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
curl http://$endpoint_url/generate -X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice tgi failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_chatqna
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna
validate_chatqna $NAMESPACE chatqna
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,118 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
ROLLOUT_TIMEOUT_SECONDS="1800s"
KUBECTL_TIMEOUT_SECONDS="60s"
function validate_chatqna() {
local ns=$1
local log=$2
max_retry=20
# make sure microservice retriever-usvc is ready
# try to curl retriever-svc for max_retry times
test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
curl http://$endpoint_url/v1/retrieval -X POST \
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice retriever failed, exit with error."
return 1
fi
# make sure microservice vllm-svc is ready
for ((i=1; i<=max_retry; i++))
do
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns)
curl http://$endpoint_url/v1/chat/completions -X POST \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
-H 'Content-Type: application/json' && break
sleep 30
done
# if i is bigger than max_retry, then exit with error
if [ $i -gt $max_retry ]; then
echo "Microservice vllm failed, exit with error."
return 1
fi
# check megaservice works
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$log.log
endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice failed, please check the logs in $LOGFILE!"
return ${exit_code}
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] &&
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
return 1
else
echo "Response check succeed!"
fi
return 0
}
function install_chatqna() {
echo "Testing manifests chatqna_vllm"
local ns=$1
bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
kubectl create namespace $ns
# install guardrail
pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
kubectl apply -f chatqna-vllm.yaml -n $ns
# Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes
sleep 280
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_ChatQnA)
pushd ChatQnA/tests/common
bash _test_manifest_utils.sh init_ChatQnA
popd
;;
install_ChatQnA)
NAMESPACE=$2
install_chatqna $NAMESPACE
popd
;;
validate_ChatQnA)
NAMESPACE=$2
SERVICE_NAME=chatqna-vllm
validate_chatqna $NAMESPACE chatqna-vllm
ret=$?
if [ $ret -ne 0 ]; then
exit $ret
fi
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -6,13 +6,18 @@ tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5

View File

@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codegen() {
# executed under path manifest/codegen/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codegen {
echo "namespace is $NAMESPACE"
kubectl apply -f codegen.yaml -n $NAMESPACE
}
function validate_codegen() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codegen..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
-d '{"messages": "def print_hello_world():"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeGen)
pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
init_codegen
popd
;;
install_CodeGen)
pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_codegen
popd
;;
validate_CodeGen)
NAMESPACE=$2
SERVICE_NAME=codegen
validate_codegen
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codegen() {
# executed under path manifest/codegen/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codegen {
echo "namespace is $NAMESPACE"
kubectl apply -f codegen.yaml -n $NAMESPACE
}
function validate_codegen() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codegen..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
-d '{"messages": "def print_hello_world():"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeGen)
pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
init_codegen
popd
;;
install_CodeGen)
pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_codegen
popd
;;
validate_CodeGen)
NAMESPACE=$2
SERVICE_NAME=codegen
validate_codegen
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -5,13 +5,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5

View File

@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codetrans() {
# executed under path manifest/codetrans/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codetrans {
echo "namespace is $NAMESPACE"
kubectl apply -f codetrans.yaml -n $NAMESPACE
}
function validate_codetrans() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codetrans \
-H 'Content-Type: application/json' \
-d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeTrans)
pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
init_codetrans
popd
;;
install_CodeTrans)
pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_codetrans
popd
;;
validate_CodeTrans)
NAMESPACE=$2
SERVICE_NAME=codetrans
validate_codetrans
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_codetrans() {
# executed under path manifest/codetrans/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_codetrans {
echo "namespace is $NAMESPACE"
kubectl apply -f codetrans.yaml -n $NAMESPACE
}
function validate_codetrans() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/codetrans \
-H 'Content-Type: application/json' \
-d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "print" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_CodeTrans)
pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
init_codetrans
popd
;;
install_CodeTrans)
pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_codetrans
popd
;;
validate_CodeTrans)
NAMESPACE=$2
SERVICE_NAME=codetrans
validate_codetrans
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -2,4 +2,6 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
enabled: true
vllm:
enabled: false

View File

@@ -1,16 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
vllm:
enabled: false
llm-uservice:
DOCSUM_BACKEND: "TGI"
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true

View File

@@ -1,87 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_docsum() {
# executed under path manifest/docsum/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_docsum {
echo "namespace is $NAMESPACE"
kubectl apply -f docsum.yaml -n $NAMESPACE
}
function validate_docsum() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/docsum..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/docsum \
-H 'Content-Type: multipart/form-data' \
-F 'type=text' \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_DocSum)
pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
init_docsum
popd
;;
install_DocSum)
pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_docsum
popd
;;
validate_DocSum)
NAMESPACE=$2
SERVICE_NAME=docsum
validate_docsum
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,87 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_docsum() {
# executed under path manifest/docsum/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/opea/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_docsum {
echo "namespace is $NAMESPACE"
kubectl apply -f docsum.yaml -n $NAMESPACE
}
function validate_docsum() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/docsum..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/docsum \
-H 'Content-Type: multipart/form-data' \
-F 'type=text' \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
exit 1
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_DocSum)
pushd DocSum/kubernetes/intel/cpu/xeon/manifest
init_docsum
popd
;;
install_DocSum)
pushd DocSum/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_docsum
popd
;;
validate_DocSum)
NAMESPACE=$2
SERVICE_NAME=docsum
validate_docsum
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -5,13 +5,25 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: "0"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
PREFILL_BATCH_BUCKET_SIZE: 1
BATCH_BUCKET_SIZE: 8
extraCmdArgs:
- "--max-batch-total-tokens"
- "65536"
- "--max-batch-prefill-tokens"
- "4096"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5

View File

@@ -0,0 +1,18 @@
# Deploy SearchQnA on Kubernetes cluster
- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
## Deploy on Xeon
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```
## Deploy on Gaudi
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
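After the install completes, a quick smoke test can be run against the gateway. This is a hedged sketch, assuming the release is named `searchqna` and the gateway service listens on port 3008 with the `/v1/searchqna` path; verify the actual service name and port with `kubectl get svc`.
```
# wait until all pods of the release are Running and Ready
kubectl get pods

# forward the gateway service to localhost (service name and port are assumptions)
kubectl port-forward svc/searchqna 8080:3008 &

# send a test query to the SearchQnA endpoint
curl http://localhost:8080/v1/searchqna \
  -H 'Content-Type: application/json' \
  -d '{"messages": "What is the latest news about open source AI?"}'
```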

View File

@@ -0,0 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
llm_uservice:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

View File

@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tei:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tei-gaudi
tag: "1.5.0"
OMPI_MCA_btl_vader_single_copy_mechanism: none
MAX_WARMUP_SEQUENCE_LENGTH: 512
securityContext:
readOnlyRootFilesystem: false
resources:
limits:
habana.ai/gaudi: 1
livenessProbe:
timeoutSeconds: 1
readinessProbe:
timeoutSeconds: 1

View File

@@ -0,0 +1,18 @@
# Deploy txt2img on Kubernetes cluster
- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).
## Deploy on Xeon
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```
## Deploy on Gaudi
```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
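A similar smoke test can be run once the pods are ready. This is a sketch only; the service name `txt2img`, port 9379, and the `/v1/text2image` request format are assumptions taken from the OPEA text2image microservice defaults, so confirm them with `kubectl get svc` and the chart README.
```
# forward the text2image service to localhost (service name and port are assumptions)
kubectl port-forward svc/txt2img 8080:9379 &

# request a single image for a test prompt
curl http://localhost:8080/v1/text2image \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "An astronaut riding a green horse", "num_images_per_prompt": 1}'
```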

View File

@@ -0,0 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
text2image:
image:
repository: opea/text2image

View File

@@ -0,0 +1,30 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
text2image:
accelDevice: "gaudi"
image:
repository: opea/text2image-gaudi
resources:
limits:
habana.ai/gaudi: 1
# The following hugepage-related settings are for the default MODEL stable-diffusion-v1-5/stable-diffusion-v1-5
# Users should change the resource limits for other models
hugepages-2Mi: 256Mi
volumes:
- name: hugepage-2mi
emptyDir:
medium: HugePages-2Mi
volumeMounts:
- name: hugepage-2mi
mountPath: /hugepages-2Mi
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
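
Per the comment above, the hugepage limit is sized for the default stable-diffusion-v1-5 model and may need adjusting for other models. A hedged sketch of overriding it at install time, assuming the chart exposes the model through a `MODEL` value and using 512Mi purely as an illustrative figure:

```
# HFTOKEN exported as in the deploy examples above
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  -f gaudi-values.yaml \
  --set text2image.MODEL="stabilityai/stable-diffusion-2-1" \
  --set text2image.resources.limits.hugepages-2Mi=512Mi
```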

View File

@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_translation() {
# executed under path manifest/translation/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_translation {
echo "namespace is $NAMESPACE"
kubectl apply -f translation.yaml -n $NAMESPACE
sleep 50s
}
function validate_translation() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/translation..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/translation \
-H 'Content-Type: application/json' \
-d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice translation failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_Translation)
pushd Translation/kubernetes/intel/hpu/gaudi/manifest
init_translation
popd
;;
install_Translation)
pushd Translation/kubernetes/intel/hpu/gaudi/manifest
NAMESPACE=$2
install_translation
popd
;;
validate_Translation)
NAMESPACE=$2
SERVICE_NAME=translation
validate_translation
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}
function init_translation() {
# executed under path manifest/translation/xeon
# replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
# replace microservice image tag
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
# replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
# set huggingface token
find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}
function install_translation {
echo "namespace is $NAMESPACE"
kubectl apply -f translation.yaml -n $NAMESPACE
}
function validate_translation() {
ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
echo "try to curl http://${ip_address}:${port}/v1/translation..."
# generate a random logfile name to avoid conflict among multiple runners
LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/translation \
-H 'Content-Type: application/json' \
-d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
echo "Megaservice translation failed, please check the logs in $LOGFILE!"
exit 1
fi
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
[[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
status=true
fi
if [ $status == false ]; then
echo "Response check failed, please check the logs in artifacts!"
else
echo "Response check succeed!"
fi
}
if [ $# -eq 0 ]; then
echo "Usage: $0 <function_name>"
exit 1
fi
case "$1" in
init_Translation)
pushd Translation/kubernetes/intel/cpu/xeon/manifest
init_translation
popd
;;
install_Translation)
pushd Translation/kubernetes/intel/cpu/xeon/manifest
NAMESPACE=$2
install_translation
popd
;;
validate_Translation)
NAMESPACE=$2
SERVICE_NAME=translation
validate_translation
;;
*)
echo "Unknown function: $1"
;;
esac

View File

@@ -9,13 +9,18 @@ tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.6"
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5