Sync value files from GenAIInfra (#1428)
All Gaudi values files were updated with extra flags. Added Helm support for two new examples: Text2Image and SearchQnA. Minor fix for llm-uservice.

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
@@ -4,35 +4,13 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  extraCmdArgs: ["--sharded","true","--num-shard","4"]
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    repository: opea/vllm-gaudi
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
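The hunk above switches the agent pipeline from a dedicated, 4-card sharded TGI to a single shared vLLM service that the supervisor, ragagent, and sqlagent subcharts all reach through llm_endpoint_url. A minimal sketch of checking that wiring after rendering the chart; the chart path ./agentqna and release name agents are illustrative assumptions, not taken from this commit:

    # Render the chart with the Gaudi values file and list the endpoints the
    # agents will call (chart path and release name are assumed here).
    helm template agents ./agentqna -f gaudi-values.yaml | grep llm_endpoint_url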
@@ -5,7 +5,7 @@ tgi:
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
@@ -33,11 +33,15 @@ tgi:
     failureThreshold: 120
 
 whisper:
+  image:
+    repository: opea/whisper-gaudi
   resources:
     limits:
       habana.ai/gaudi: 1
 
 speecht5:
+  image:
+    repository: opea/speecht5-gaudi
   resources:
     limits:
       habana.ai/gaudi: 1
ChatQnA/kubernetes/helm/cpu-tgi-values.yaml (new file, 112 lines)
@@ -0,0 +1,112 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Override CPU resource request and probe timing values in specific subcharts
+#
+# RESOURCES
+#
+# Resource request matching actual resource usage (with enough slack)
+# is important when service is scaled up, so that right amount of pods
+# get scheduled to right nodes.
+#
+# Because resource usage depends on the used devices, model, data type
+# and SW versions, and this top-level chart has overrides for them,
+# resource requests need to be specified here too.
+#
+# To test service without resource request, use "resources: {}".
+#
+# PROBES
+#
+# Inferencing pods startup / warmup takes *much* longer on CPUs than
+# with acceleration devices, and their responses are also slower,
+# especially when node is running several instances of these services.
+#
+# Kubernetes restarting pod before its startup finishes, or not
+# sending it queries because it's not in ready state due to slow
+# readiness responses, does really NOT help in getting faster responses.
+#
+# => probe timings need to be increased when running on CPU.
+
+vllm:
+  enabled: false
+tgi:
+  enabled: true
+  # TODO: add Helm value also for TGI data type option:
+  # https://github.com/opea-project/GenAIExamples/issues/330
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+
+  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
+  #resources:
+  #  limits:
+  #    cpu: 8
+  #    memory: 70Gi
+  #  requests:
+  #    cpu: 6
+  #    memory: 65Gi
+
+  livenessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    failureThreshold: 24
+    timeoutSeconds: 4
+  readinessProbe:
+    initialDelaySeconds: 16
+    periodSeconds: 8
+    timeoutSeconds: 4
+  startupProbe:
+    initialDelaySeconds: 10
+    periodSeconds: 5
+    failureThreshold: 180
+    timeoutSeconds: 2
+
+teirerank:
+  RERANK_MODEL_ID: "BAAI/bge-reranker-base"
+
+  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
+  resources:
+    limits:
+      cpu: 4
+      memory: 30Gi
+    requests:
+      cpu: 2
+      memory: 25Gi
+
+  livenessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    failureThreshold: 24
+    timeoutSeconds: 4
+  readinessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    timeoutSeconds: 4
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 120
+
+tei:
+  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
+
+  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
+  resources:
+    limits:
+      cpu: 4
+      memory: 4Gi
+    requests:
+      cpu: 2
+      memory: 3Gi
+
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 24
+    timeoutSeconds: 2
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 2
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 120
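The probe numbers above encode the comment block's reasoning: for TGI on CPU the startup budget is initialDelaySeconds + failureThreshold × periodSeconds = 10 + 180 × 5 = 910 seconds, roughly 15 minutes of warmup before Kubernetes gives up, versus the much tighter budgets used on accelerators. A hedged install sketch; the chart path and namespace are assumptions, only the values file path comes from this commit:

    # Deploy ChatQnA on CPU with TGI as the LLM backend (chart path assumed).
    helm install chatqna ./chatqna \
        --namespace chatqna --create-namespace \
        -f ChatQnA/kubernetes/helm/cpu-tgi-values.yaml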
@@ -1,109 +1,5 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# Override CPU resource request and probe timing values in specific subcharts
-#
-# RESOURCES
-#
-# Resource request matching actual resource usage (with enough slack)
-# is important when service is scaled up, so that right amount of pods
-# get scheduled to right nodes.
-#
-# Because resource usage depends on the used devices, model, data type
-# and SW versions, and this top-level chart has overrides for them,
-# resource requests need to be specified here too.
-#
-# To test service without resource request, use "resources: {}".
-#
-# PROBES
-#
-# Inferencing pods startup / warmup takes *much* longer on CPUs than
-# with acceleration devices, and their responses are also slower,
-# especially when node is running several instances of these services.
-#
-# Kubernetes restarting pod before its startup finishes, or not
-# sending it queries because it's not in ready state due to slow
-# readiness responses, does really NOT help in getting faster responses.
-#
-# => probe timings need to be increased when running on CPU.
-
-tgi:
-  # TODO: add Helm value also for TGI data type option:
-  # https://github.com/opea-project/GenAIExamples/issues/330
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
-
-  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
-  resources:
-    limits:
-      cpu: 8
-      memory: 70Gi
-    requests:
-      cpu: 6
-      memory: 65Gi
-
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
-  readinessProbe:
-    initialDelaySeconds: 16
-    periodSeconds: 8
-    timeoutSeconds: 4
-  startupProbe:
-    initialDelaySeconds: 10
-    periodSeconds: 5
-    failureThreshold: 180
-    timeoutSeconds: 2
-
-teirerank:
-  RERANK_MODEL_ID: "BAAI/bge-reranker-base"
-
-  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
-  resources:
-    limits:
-      cpu: 4
-      memory: 30Gi
-    requests:
-      cpu: 2
-      memory: 25Gi
-
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
-  readinessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    timeoutSeconds: 4
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 120
-
-tei:
-  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
-
-  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
-  resources:
-    limits:
-      cpu: 4
-      memory: 4Gi
-    requests:
-      cpu: 2
-      memory: 3Gi
-
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 24
-    timeoutSeconds: 2
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 2
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 120
+image:
+  repository: opea/chatqna
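The CPU tuning deleted here is not lost: it moved into the new cpu-tgi-values.yaml added above, leaving this file as a thin image override. Since Helm merges later -f files over earlier ones, the two can still be layered; the slimmed file's name is not shown in this excerpt, so cpu-values.yaml below is an assumption, as is the chart path:

    # Layer the slim base values with the CPU TGI tuning (names assumed).
    helm install chatqna ./chatqna -f cpu-values.yaml -f cpu-tgi-values.yaml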
@@ -4,12 +4,15 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
+vllm:
+  enabled: false
 # TGI: largest bottleneck for ChatQnA
 tgi:
+  enabled: true
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
@@ -6,9 +6,9 @@
 
 tgi:
   enabled: false
 
 vllm:
   enabled: true
+  shmSize: 1Gi
   accelDevice: "gaudi"
   image:
     repository: opea/vllm-gaudi
@@ -19,7 +19,7 @@ vllm:
     initialDelaySeconds: 5
     periodSeconds: 5
     timeoutSeconds: 1
-    failureThreshold: 120
+    failureThreshold: 180
   readinessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
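With periodSeconds still 5, the change above raises the vLLM startup window from 120 × 5 = 600 seconds to 180 × 5 = 900 seconds, consistent with the roughly five-minute (or longer) Gaudi warmup that the test scripts later in this commit sleep for.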
@@ -39,7 +39,6 @@ vllm:
     "--max-seq_len-to-capture", "2048"
   ]
 
-
 # Reranking: second largest bottleneck when reranking is in use
 # (i.e. query context docs have been uploaded with data-prep)
 #
@@ -44,44 +44,13 @@ teirerank:
   readinessProbe:
     timeoutSeconds: 1
 
-tgi:
-  accelDevice: "gaudi"
-  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
-  resources:
-    limits:
-      habana.ai/gaudi: 1
-  # higher limits are needed with extra input tokens added by rerank
-  MAX_INPUT_LENGTH: "2048"
-  MAX_TOTAL_TOKENS: "4096"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
-
 tgi-guardrails:
   enabled: true
   accelDevice: "gaudi"
   LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
@@ -106,3 +75,38 @@ tgi-guardrails:
     periodSeconds: 5
     timeoutSeconds: 1
     failureThreshold: 120
+
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  shmSize: 1Gi
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 180
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
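With vLLM enabled behind the guardrails pipeline, its health can be spot-checked with the same OpenAI-style request the deleted manifest tests used. A sketch, assuming the service is named chatqna-vllm and listens on port 80 in namespace chatqna (none of which is confirmed by this diff):

    # Port-forward the assumed vLLM service and send one chat completion.
    kubectl -n chatqna port-forward svc/chatqna-vllm 8000:80 &
    curl http://localhost:8000/v1/chat/completions -X POST \
        -H 'Content-Type: application/json' \
        -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'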
@@ -1,14 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-image:
-  repository: opea/chatqna-guardrails
-
-# guardrails related config
-guardrails-usvc:
-  enabled: true
-  # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails"
-  SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
-tgi-guardrails:
-  enabled: true
-  LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B"
@@ -1,25 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# To override values in subchart tgi
-tgi:
-  accelDevice: "nvidia"
-  image:
-    repository: ghcr.io/huggingface/text-generation-inference
-    tag: "2.2.0"
-  resources:
-    limits:
-      nvidia.com/gpu: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
@@ -1,117 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
-    local ns=$1
-    local log=$2
-    max_retry=20
-    # make sure microservice retriever-usvc is ready
-    # try to curl retriever-svc for max_retry times
-    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
-        curl http://$endpoint_url/v1/retrieval -X POST \
-            -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice retriever failed, exit with error."
-        return 1
-    fi
-    # make sure microservice tgi-svc is ready
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
-        curl http://$endpoint_url/generate -X POST \
-            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice tgi failed, exit with error."
-        return 1
-    fi
-
-    # check megaservice works
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$log.log
-    endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
-    curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice failed, please check the logs in $LOGFILE!"
-        return ${exit_code}
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] &&
-        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-    if [[ $status == false ]]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        return 1
-    else
-        echo "Response check succeed!"
-    fi
-    return 0
-}
-
-function install_chatqna() {
-    echo "Testing manifests chatqna_guardrails"
-    local ns=$1
-    bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
-    pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
-    kubectl create namespace $ns
-    # install guardrails
-    kubectl apply -f chatqna-guardrails.yaml -n $ns
-    # Sleep enough time for chatqna_guardrails to be ready
-    sleep 60
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_ChatQnA)
-        pushd ChatQnA/tests/common
-        bash _test_manifest_utils.sh init_ChatQnA
-        popd
-        ;;
-    install_ChatQnA)
-        NAMESPACE=$2
-        install_chatqna $NAMESPACE
-        popd
-        ;;
-    validate_ChatQnA)
-        NAMESPACE=$2
-        SERVICE_NAME=chatqna-guardrails
-        validate_chatqna $NAMESPACE chatqna-guardrails
-        ret=$?
-        if [ $ret -ne 0 ]; then
-            exit $ret
-        fi
-        ;;
-
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,117 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
-    local ns=$1
-    local log=$2
-    max_retry=10
-    # make sure microservice retriever-usvc is ready
-    # try to curl retriever-svc for max_retry times
-    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
-        curl http://$endpoint_url/v1/retrieval -X POST \
-            -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice retriever failed, exit with error."
-        return 1
-    fi
-    # make sure microservice tgi-svc is ready
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
-        curl http://$endpoint_url/generate -X POST \
-            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice tgi failed, exit with error."
-        return 1
-    fi
-
-    # check megaservice works
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$log.log
-    endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
-    curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice failed, please check the logs in $LOGFILE!"
-        return ${exit_code}
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] &&
-        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        return 1
-    else
-        echo "Response check succeed!"
-    fi
-    return 0
-}
-
-function install_chatqna() {
-    echo "Testing manifests chatqna_guardrails"
-    local ns=$1
-    bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
-    pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
-    kubectl create namespace $ns
-    # install guardrail
-    kubectl apply -f chatqna-guardrails.yaml -n $ns
-    # Sleep enough time for chatqna_guardrails to be ready
-    sleep 60
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_ChatQnA)
-        pushd ChatQnA/tests/common
-        bash _test_manifest_utils.sh init_ChatQnA
-        popd
-        ;;
-    install_ChatQnA)
-        NAMESPACE=$2
-        install_chatqna $NAMESPACE
-        popd
-        ;;
-    validate_ChatQnA)
-        NAMESPACE=$2
-        SERVICE_NAME=chatqna-guardrails
-        validate_chatqna $NAMESPACE chatqna-guardrails
-        ret=$?
-        if [ $ret -ne 0 ]; then
-            exit $ret
-        fi
-        ;;
-
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,113 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function install_chatqna {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f chatqna.yaml -n $NAMESPACE
-    # Sleep enough time for retreiver-usvc to be ready
-    sleep 60
-}
-
-function validate_chatqna() {
-    local ns=$1
-    local log=$2
-    max_retry=20
-    # make sure microservice retriever-usvc is ready
-    # try to curl retriever-svc for max_retry times
-    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
-        curl http://$endpoint_url/v1/retrieval -X POST \
-            -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice retriever failed, exit with error."
-        return 1
-    fi
-    # make sure microservice tgi-svc is ready
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
-        curl http://$endpoint_url/generate -X POST \
-            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice tgi failed, exit with error."
-        return 1
-    fi
-
-    # check megaservice works
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$log.log
-    endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
-    curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice failed, please check the logs in $LOGFILE!"
-        return ${exit_code}
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] &&
-        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        return 1
-    else
-        echo "Response check succeed!"
-    fi
-    return 0
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_ChatQnA)
-        pushd ChatQnA/tests/common
-        bash _test_manifest_utils.sh init_ChatQnA
-        popd
-        ;;
-    install_ChatQnA)
-        pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
-        NAMESPACE=$2
-        install_chatqna
-        popd
-        ;;
-    validate_ChatQnA)
-        NAMESPACE=$2
-        SERVICE_NAME=chatqna
-        validate_chatqna $NAMESPACE chatqna
-        ret=$?
-        if [ $ret -ne 0 ]; then
-            exit $ret
-        fi
-        ;;
-
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,112 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function install_chatqna {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f chatqna.yaml -n $NAMESPACE
-    # Sleep enough time for retreiver-usvc to be ready
-    sleep 60
-}
-
-function validate_chatqna() {
-    local ns=$1
-    local log=$2
-    max_retry=10
-    # make sure microservice retriever-usvc is ready
-    # try to curl retriever-svc for max_retry times
-    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
-        curl http://$endpoint_url/v1/retrieval -X POST \
-            -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice retriever failed, exit with error."
-        return 1
-    fi
-    # make sure microservice tgi-svc is ready
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-tgi" $ns)
-        curl http://$endpoint_url/generate -X POST \
-            -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice tgi failed, exit with error."
-        return 1
-    fi
-
-    # check megaservice works
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$log.log
-    endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
-    curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice failed, please check the logs in $LOGFILE!"
-        return ${exit_code}
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] &&
-        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        return 1
-    else
-        echo "Response check succeed!"
-    fi
-    return 0
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_ChatQnA)
-        pushd ChatQnA/tests/common
-        bash _test_manifest_utils.sh init_ChatQnA
-        popd
-        ;;
-    install_ChatQnA)
-        pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest
-        NAMESPACE=$2
-        install_chatqna
-        popd
-        ;;
-    validate_ChatQnA)
-        NAMESPACE=$2
-        SERVICE_NAME=chatqna
-        validate_chatqna $NAMESPACE chatqna
-        ret=$?
-        if [ $ret -ne 0 ]; then
-            exit $ret
-        fi
-        ;;
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,118 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-ROLLOUT_TIMEOUT_SECONDS="1800s"
-KUBECTL_TIMEOUT_SECONDS="60s"
-
-function validate_chatqna() {
-    local ns=$1
-    local log=$2
-    max_retry=20
-    # make sure microservice retriever-usvc is ready
-    # try to curl retriever-svc for max_retry times
-    test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-retriever-usvc" $ns)
-        curl http://$endpoint_url/v1/retrieval -X POST \
-            -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice retriever failed, exit with error."
-        return 1
-    fi
-
-    # make sure microservice vllm-svc is ready
-    for ((i=1; i<=max_retry; i++))
-    do
-        endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna-vllm" $ns)
-        curl http://$endpoint_url/v1/chat/completions -X POST \
-            -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
-            -H 'Content-Type: application/json' && break
-        sleep 30
-    done
-    # if i is bigger than max_retry, then exit with error
-    if [ $i -gt $max_retry ]; then
-        echo "Microservice vllm failed, exit with error."
-        return 1
-    fi
-
-    # check megaservice works
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$log.log
-    endpoint_url=$(bash ChatQnA/tests/common/_test_manifest_utils.sh get_end_point "chatqna" $ns)
-    curl http://$endpoint_url/v1/chatqna -H "Content-Type: application/json" -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": "What is the revenue of Nike in 2023?"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice failed, please check the logs in $LOGFILE!"
-        return ${exit_code}
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] &&
-        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        return 1
-    else
-        echo "Response check succeed!"
-    fi
-    return 0
-}
-
-function install_chatqna() {
-    echo "Testing manifests chatqna_vllm"
-    local ns=$1
-    bash ChatQnA/tests/common/_test_manifest_utils.sh _cleanup_ns $ns
-    kubectl create namespace $ns
-    # install guardrail
-    pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest
-    kubectl apply -f chatqna-vllm.yaml -n $ns
-    # Sleep enough time for chatqna_vllm to be ready, vllm warmup takes about 5 minutes
-    sleep 280
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_ChatQnA)
-        pushd ChatQnA/tests/common
-        bash _test_manifest_utils.sh init_ChatQnA
-        popd
-        ;;
-    install_ChatQnA)
-        NAMESPACE=$2
-        install_chatqna $NAMESPACE
-        popd
-        ;;
-    validate_ChatQnA)
-        NAMESPACE=$2
-        SERVICE_NAME=chatqna-vllm
-        validate_chatqna $NAMESPACE chatqna-vllm
-        ret=$?
-        if [ $ret -ne 0 ]; then
-            exit $ret
-        fi
-        ;;
-
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -6,13 +6,18 @@ tgi:
   LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
   MAX_INPUT_LENGTH: "1024"
   MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
    periodSeconds: 5
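The five added keys are passed to the TGI Gaudi container as environment variables, the same set the ChatQnA and CodeTrans values gain elsewhere in this commit. A hedged way to confirm they reach the rendered manifest; the chart path and release name below are illustrative:

    # Render the CodeGen chart and check the HPU flags appear in the pod env.
    helm template codegen ./codegen -f gaudi-values.yaml \
        | grep -E 'ENABLE_HPU_GRAPH|USE_FLASH_ATTENTION|FLASH_ATTENTION_RECOMPUTE'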
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codegen() {
-    # executed under path manifest/codegen/xeon
-    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
-    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
-    # replace microservice image tag
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
-    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
-    # set huggingface token
-    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codegen {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f codegen.yaml -n $NAMESPACE
-}
-
-function validate_codegen() {
-    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
-    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
-    echo "try to curl http://${ip_address}:${port}/v1/codegen..."
-
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
-    # Curl the Mega Service
-    curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
-        -d '{"messages": "def print_hello_world():"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
-        exit 1
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] && \
-        [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        exit 1
-    else
-        echo "Response check succeed!"
-    fi
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_CodeGen)
-        pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
-        init_codegen
-        popd
-        ;;
-    install_CodeGen)
-        pushd CodeGen/kubernetes/intel/hpu/gaudi/manifest
-        NAMESPACE=$2
-        install_codegen
-        popd
-        ;;
-    validate_CodeGen)
-        NAMESPACE=$2
-        SERVICE_NAME=codegen
-        validate_codegen
-        ;;
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,85 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codegen() {
-    # executed under path manifest/codegen/xeon
-    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
-    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
-    # replace microservice image tag
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
-    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
-    # set huggingface token
-    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codegen {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f codegen.yaml -n $NAMESPACE
-}
-
-function validate_codegen() {
-    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
-    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
-    echo "try to curl http://${ip_address}:${port}/v1/codegen..."
-
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
-    # Curl the Mega Service
-    curl http://${ip_address}:${port}/v1/codegen -H "Content-Type: application/json" \
-        -d '{"messages": "def print_hello_world():"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice codegen failed, please check the logs in $LOGFILE!"
-        exit 1
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] && \
-        [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        exit 1
-    else
-        echo "Response check succeed!"
-    fi
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_CodeGen)
-        pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
-        init_codegen
-        popd
-        ;;
-    install_CodeGen)
-        pushd CodeGen/kubernetes/intel/cpu/xeon/manifest
-        NAMESPACE=$2
-        install_codegen
-        popd
-        ;;
-    validate_CodeGen)
-        NAMESPACE=$2
-        SERVICE_NAME=codegen
-        validate_codegen
-        ;;
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -5,13 +5,18 @@ tgi:
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
  resources:
     limits:
       habana.ai/gaudi: 1
   MAX_INPUT_LENGTH: "1024"
   MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codetrans() {
-    # executed under path manifest/codetrans/xeon
-    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
-    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
-    # replace microservice image tag
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
-    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
-    # set huggingface token
-    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codetrans {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f codetrans.yaml -n $NAMESPACE
-}
-
-function validate_codetrans() {
-    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
-    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
-    echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
-
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
-    # Curl the Mega Service
-    curl http://${ip_address}:${port}/v1/codetrans \
-        -H 'Content-Type: application/json' \
-        -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
-        exit 1
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] && \
-        [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        exit 1
-    else
-        echo "Response check succeed!"
-    fi
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_CodeTrans)
-        pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
-        init_codetrans
-        popd
-        ;;
-    install_CodeTrans)
-        pushd CodeTrans/kubernetes/intel/hpu/gaudi/manifest
-        NAMESPACE=$2
-        install_codetrans
-        popd
-        ;;
-    validate_CodeTrans)
-        NAMESPACE=$2
-        SERVICE_NAME=codetrans
-        validate_codetrans
-        ;;
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,86 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_codetrans() {
-    # executed under path manifest/codetrans/xeon
-    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
-    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
-    # replace microservice image tag
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
-    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
-    # set huggingface token
-    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_codetrans {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f codetrans.yaml -n $NAMESPACE
-}
-
-function validate_codetrans() {
-    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
-    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
-    echo "try to curl http://${ip_address}:${port}/v1/codetrans..."
-
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
-    # Curl the Mega Service
-    curl http://${ip_address}:${port}/v1/codetrans \
-        -H 'Content-Type: application/json' \
-        -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}"}' > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice codetrans failed, please check the logs in $LOGFILE!"
-        exit 1
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] && \
-        [[ $(grep -c "print" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        exit 1
-    else
-        echo "Response check succeed!"
-    fi
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_CodeTrans)
-        pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
-        init_codetrans
-        popd
-        ;;
-    install_CodeTrans)
-        pushd CodeTrans/kubernetes/intel/cpu/xeon/manifest
-        NAMESPACE=$2
-        install_codetrans
-        popd
-        ;;
-    validate_CodeTrans)
-        NAMESPACE=$2
-        SERVICE_NAME=codetrans
-        validate_codetrans
-        ;;
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -2,4 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  enabled: true
+vllm:
+  enabled: false
@@ -1,16 +1,21 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+vllm:
+  enabled: false
+
+llm-uservice:
+  DOCSUM_BACKEND: "TGI"
+
 tgi:
+  enabled: true
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: ""
   ENABLE_HPU_GRAPH: true
   LIMIT_HPU_GRAPH: true
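DOCSUM_BACKEND tells llm-uservice which serving engine sits behind it, mirroring the tgi/vllm enabled toggles above. A sketch of flipping the file's default to vLLM at install time; the chart path and the accepted value string for the vLLM backend are assumptions, not shown in this diff:

    # Override the backend selection on the command line (names assumed).
    helm upgrade --install docsum ./docsum -f gaudi-values.yaml \
        --set tgi.enabled=false --set vllm.enabled=true \
        --set llm-uservice.DOCSUM_BACKEND=vLLM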
@@ -1,87 +0,0 @@
-#!/bin/bash
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-set -xe
-USER_ID=$(whoami)
-LOG_PATH=/home/$(whoami)/logs
-MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
-IMAGE_REPO=${IMAGE_REPO:-opea}
-IMAGE_TAG=${IMAGE_TAG:-latest}
-
-function init_docsum() {
-    # executed under path manifest/docsum/xeon
-    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
-    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
-    # replace microservice image tag
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
-    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
-    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
-    # set huggingface token
-    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
-}
-
-function install_docsum {
-    echo "namespace is $NAMESPACE"
-    kubectl apply -f docsum.yaml -n $NAMESPACE
-}
-
-function validate_docsum() {
-    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
-    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
-    echo "try to curl http://${ip_address}:${port}/v1/docsum..."
-
-    # generate a random logfile name to avoid conflict among multiple runners
-    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
-    # Curl the Mega Service
-    curl http://${ip_address}:${port}/v1/docsum \
-        -H 'Content-Type: multipart/form-data' \
-        -F 'type=text' \
-        -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
-    exit_code=$?
-    if [ $exit_code -ne 0 ]; then
-        echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
-        exit 1
-    fi
-
-    echo "Checking response results, make sure the output is reasonable. "
-    local status=false
-    if [[ -f $LOGFILE ]] && \
-        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
-        status=true
-    fi
-
-    if [ $status == false ]; then
-        echo "Response check failed, please check the logs in artifacts!"
-        exit 1
-    else
-        echo "Response check succeed!"
-    fi
-}
-
-if [ $# -eq 0 ]; then
-    echo "Usage: $0 <function_name>"
-    exit 1
-fi
-
-case "$1" in
-    init_DocSum)
-        pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
-        init_docsum
-        popd
-        ;;
-    install_DocSum)
-        pushd DocSum/kubernetes/intel/hpu/gaudi/manifest
-        NAMESPACE=$2
-        install_docsum
-        popd
-        ;;
-    validate_DocSum)
-        NAMESPACE=$2
-        SERVICE_NAME=docsum
-        validate_docsum
-        ;;
-    *)
-        echo "Unknown function: $1"
-        ;;
-esac
@@ -1,87 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}

function init_docsum() {
    # executed under path manifest/docsum/xeon
    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
    # replace microservice image tag
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
    # set huggingface token
    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}

function install_docsum {
    echo "namespace is $NAMESPACE"
    kubectl apply -f docsum.yaml -n $NAMESPACE
}

function validate_docsum() {
    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
    echo "try to curl http://${ip_address}:${port}/v1/docsum..."

    # generate a random logfile name to avoid conflict among multiple runners
    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
    # Curl the Mega Service
    curl http://${ip_address}:${port}/v1/docsum \
        -H 'Content-Type: multipart/form-data' \
        -F 'type=text' \
        -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        echo "Megaservice docsum failed, please check the logs in $LOGFILE!"
        exit 1
    fi

    echo "Checking response results; make sure the output is reasonable."
    local status=false
    if [[ -f $LOGFILE ]] && \
       [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
        status=true
    fi

    if [ $status == false ]; then
        echo "Response check failed, please check the logs in artifacts!"
        exit 1
    else
        echo "Response check succeeded!"
    fi
}

if [ $# -eq 0 ]; then
    echo "Usage: $0 <function_name>"
    exit 1
fi

case "$1" in
    init_DocSum)
        pushd DocSum/kubernetes/intel/cpu/xeon/manifest
        init_docsum
        popd
        ;;
    install_DocSum)
        pushd DocSum/kubernetes/intel/cpu/xeon/manifest
        NAMESPACE=$2
        install_docsum
        popd
        ;;
    validate_DocSum)
        NAMESPACE=$2
        SERVICE_NAME=docsum
        validate_docsum
        ;;
    *)
        echo "Unknown function: $1"
        ;;
esac
@@ -5,13 +5,25 @@ tgi:
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: "0"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  PREFILL_BATCH_BUCKET_SIZE: 1
+  BATCH_BUCKET_SIZE: 8
+  extraCmdArgs:
+    - "--max-batch-total-tokens"
+    - "65536"
+    - "--max-batch-prefill-tokens"
+    - "4096"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
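A quick sanity check on these numbers (simple arithmetic, not part of the commit): with --max-batch-total-tokens at 65536 and MAX_TOTAL_TOKENS at 2048 per sequence, the server can hold at most 65536 / 2048 = 32 full-length sequences in flight, while BATCH_BUCKET_SIZE 8 pads decode batches up to multiples of 8 to keep HPU graph shapes stable.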
18  SearchQnA/kubernetes/helm/README.md  Normal file
@@ -0,0 +1,18 @@
# Deploy SearchQnA on Kubernetes cluster

- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).

## Deploy on Xeon

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```

## Deploy on Gaudi

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install searchqna oci://ghcr.io/opea-project/charts/searchqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
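Once the release is up, a minimal smoke test could look like the sketch below; the service name, port 8888, and the /v1/searchqna route follow the usual OPEA megaservice conventions and are assumptions here, so confirm them with `kubectl get svc` first:

```
kubectl get pods                          # wait until all pods are Running/Ready
kubectl port-forward svc/searchqna 8888:8888 &
curl http://localhost:8888/v1/searchqna \
  -H 'Content-Type: application/json' \
  -d '{"messages": "What is OPEA?"}'
```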
7  SearchQnA/kubernetes/helm/cpu-values.yaml  Normal file
@@ -0,0 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
llm-uservice:
  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
50  SearchQnA/kubernetes/helm/gaudi-values.yaml  Normal file
@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "2048"
  MAX_TOTAL_TOKENS: "4096"
  CUDA_GRAPHS: ""
  HF_HUB_DISABLE_PROGRESS_BARS: 1
  HF_HUB_ENABLE_HF_TRANSFER: 0
  ENABLE_HPU_GRAPH: true
  LIMIT_HPU_GRAPH: true
  USE_FLASH_ATTENTION: true
  FLASH_ATTENTION_RECOMPUTE: true
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120

tei:
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tei-gaudi
    tag: "1.5.0"
  OMPI_MCA_btl_vader_single_copy_mechanism: none
  MAX_WARMUP_SEQUENCE_LENGTH: 512
  securityContext:
    readOnlyRootFilesystem: false
  resources:
    limits:
      habana.ai/gaudi: 1
  livenessProbe:
    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
18  Text2Image/kubernetes/helm/README.md  Normal file
@@ -0,0 +1,18 @@
# Deploy txt2img on Kubernetes cluster

- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information.
- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme).

## Deploy on Xeon

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml
```

## Deploy on Gaudi

```
export HFTOKEN="insert-your-huggingface-token-here"
helm install txt2img oci://ghcr.io/opea-project/charts/txt2img --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml
```
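A quick post-install check, using the release name from the commands above (the pod name prefix is an assumption; the first startup is slow because the model is pulled on demand):

```
helm status txt2img
kubectl get pods | grep text2image   # wait for Running/Ready
```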
6  Text2Image/kubernetes/helm/cpu-values.yaml  Normal file
@@ -0,0 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

text2image:
  image:
    repository: opea/text2image
30  Text2Image/kubernetes/helm/gaudi-values.yaml  Normal file
@@ -0,0 +1,30 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

text2image:
  accelDevice: "gaudi"
  image:
    repository: opea/text2image-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
      # The following hugepage-related settings are for the default MODEL stable-diffusion-v1-5/stable-diffusion-v1-5.
      # Users should change the resource limits for other models.
      hugepages-2Mi: 256Mi
  volumes:
    - name: hugepage-2mi
      emptyDir:
        medium: HugePages-2Mi
  volumeMounts:
    - name: hugepage-2mi
      mountPath: /hugepages-2Mi
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
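The hugepages-2Mi limit only schedules if the node actually exposes 2Mi hugepages. A standard way to check, and to reserve pages if none are available (node name and page count are illustrative; 256Mi at 2Mi per page works out to 128 pages):

```
kubectl describe node <node-name> | grep -i hugepages-2mi
# if Allocatable shows 0, reserve pages on the node; a kubelet restart
# may be needed before the new allocatable value is reported:
#   sudo sysctl vm.nr_hugepages=128
```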
@@ -1,86 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}

function init_translation() {
    # executed under path manifest/translation/gaudi
    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
    # replace microservice image tag
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
    # set huggingface token
    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}

function install_translation {
    echo "namespace is $NAMESPACE"
    kubectl apply -f translation.yaml -n $NAMESPACE
    sleep 50s
}

function validate_translation() {
    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
    echo "try to curl http://${ip_address}:${port}/v1/translation..."

    # generate a random logfile name to avoid conflict among multiple runners
    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
    # Curl the Mega Service
    curl http://${ip_address}:${port}/v1/translation \
        -H 'Content-Type: application/json' \
        -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        echo "Megaservice translation failed, please check the logs in $LOGFILE!"
        exit 1
    fi

    echo "Checking response results; make sure the output is reasonable."
    local status=false
    if [[ -f $LOGFILE ]] && \
       [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
        status=true
    fi

    if [ $status == false ]; then
        echo "Response check failed, please check the logs in artifacts!"
    else
        echo "Response check succeeded!"
    fi
}

if [ $# -eq 0 ]; then
    echo "Usage: $0 <function_name>"
    exit 1
fi

case "$1" in
    init_Translation)
        pushd Translation/kubernetes/intel/hpu/gaudi/manifest
        init_translation
        popd
        ;;
    install_Translation)
        pushd Translation/kubernetes/intel/hpu/gaudi/manifest
        NAMESPACE=$2
        install_translation
        popd
        ;;
    validate_Translation)
        NAMESPACE=$2
        SERVICE_NAME=translation
        validate_translation
        ;;
    *)
        echo "Unknown function: $1"
        ;;
esac
@@ -1,85 +0,0 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe
USER_ID=$(whoami)
LOG_PATH=/home/$(whoami)/logs
MOUNT_DIR=/home/$USER_ID/.cache/huggingface/hub
IMAGE_REPO=${IMAGE_REPO:-opea}
IMAGE_TAG=${IMAGE_TAG:-latest}

function init_translation() {
    # executed under path manifest/translation/xeon
    # replace the mount dir "path: /mnt/model" with "path: $CHART_MOUNT"
    find . -name '*.yaml' -type f -exec sed -i "s#path: /mnt/opea-models#path: $MOUNT_DIR#g" {} \;
    # replace microservice image tag
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/\(.*\):latest#image: \"opea/\1:${IMAGE_TAG}#g" {} \;
    # replace the repository "image: opea/*" with "image: $IMAGE_REPO/"
    find . -name '*.yaml' -type f -exec sed -i "s#image: \"opea/*#image: \"${IMAGE_REPO}/#g" {} \;
    # set huggingface token
    find . -name '*.yaml' -type f -exec sed -i "s#insert-your-huggingface-token-here#$(cat /home/$USER_ID/.cache/huggingface/token)#g" {} \;
}

function install_translation {
    echo "namespace is $NAMESPACE"
    kubectl apply -f translation.yaml -n $NAMESPACE
}

function validate_translation() {
    ip_address=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.clusterIP}')
    port=$(kubectl get svc $SERVICE_NAME -n $NAMESPACE -o jsonpath='{.spec.ports[0].port}')
    echo "try to curl http://${ip_address}:${port}/v1/translation..."

    # generate a random logfile name to avoid conflict among multiple runners
    LOGFILE=$LOG_PATH/curlmega_$NAMESPACE.log
    # Curl the Mega Service
    curl http://${ip_address}:${port}/v1/translation \
        -H 'Content-Type: application/json' \
        -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' > $LOGFILE
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        echo "Megaservice translation failed, please check the logs in $LOGFILE!"
        exit 1
    fi

    echo "Checking response results; make sure the output is reasonable."
    local status=false
    if [[ -f $LOGFILE ]] && \
       [[ $(grep -c "translation" $LOGFILE) != 0 ]]; then
        status=true
    fi

    if [ $status == false ]; then
        echo "Response check failed, please check the logs in artifacts!"
    else
        echo "Response check succeeded!"
    fi
}

if [ $# -eq 0 ]; then
    echo "Usage: $0 <function_name>"
    exit 1
fi

case "$1" in
    init_Translation)
        pushd Translation/kubernetes/intel/cpu/xeon/manifest
        init_translation
        popd
        ;;
    install_Translation)
        pushd Translation/kubernetes/intel/cpu/xeon/manifest
        NAMESPACE=$2
        install_translation
        popd
        ;;
    validate_Translation)
        NAMESPACE=$2
        SERVICE_NAME=translation
        validate_translation
        ;;
    *)
        echo "Unknown function: $1"
        ;;
esac
@@ -9,13 +9,18 @@ tgi:
   accelDevice: "gaudi"
   image:
     repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
+    tag: "2.3.1"
   resources:
     limits:
       habana.ai/gaudi: 1
   MAX_INPUT_LENGTH: "4096"
   MAX_TOTAL_TOKENS: "8192"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5