Sync values yaml file for 1.3 release (#1748)

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
2025-04-08 22:39:40 +08:00
parent b14db6dbd3
commit 46ebb78aa3
34 changed files with 580 additions and 212 deletions
--- a/AgentQnA/kubernetes/helm/cpu-values.yaml
+++ b/AgentQnA/kubernetes/helm/cpu-values.yaml
@@ -0,0 +1,22 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # LLM serving: vllm is enabled, tgi is disabled.
 vllm:
  enabled: true
  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
  extraCmdArgs:
    - '--max-seq_len-to-capture'
    - '16384'
    - '--enable-auto-tool-choice'
    - '--tool-call-parser'
    - 'llama3_json'
 tgi:
  enabled: false
 # The three agents below share one endpoint, engine and model id.
 supervisor:
  model: 'meta-llama/Meta-Llama-3-8B-Instruct'
  llm_engine: vllm
  llm_endpoint_url: 'http://{{ .Release.Name }}-vllm'
 ragagent:
  model: 'meta-llama/Meta-Llama-3-8B-Instruct'
  llm_engine: vllm
  llm_endpoint_url: 'http://{{ .Release.Name }}-vllm'
 sqlagent:
  model: 'meta-llama/Meta-Llama-3-8B-Instruct'
  llm_engine: vllm
  llm_endpoint_url: 'http://{{ .Release.Name }}-vllm'
--- a/AgentQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,13 +4,32 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 # tgi is switched off; vllm is the enabled LLM backend.
 tgi:
  enabled: false
 vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
  resources:
    limits:
      habana.ai/gaudi: 4
  LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
  # Quoted so the values stay strings rather than YAML booleans, matching the
  # quoted form the other Gaudi vllm values files use for the same keys.
  # NOTE(review): presumably consumed as container environment values - confirm.
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  VLLM_SKIP_WARMUP: "true"
  shmSize: 16Gi
  # --tensor-parallel-size is 4, the same number as the habana.ai/gaudi limit.
  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
 # All three agents use the same endpoint, engine and model.
 # The templated URL is quoted so it is always read as one string.
 supervisor:
  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
 ragagent:
  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
 sqlagent:
  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
--- a/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
@@ -0,0 +1,15 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # Top-level image repository used by this variant.
 image:
  repository: 'opea/audioqna-multilang'
 # Enabled components: vllm and gpt-sovits.
 vllm:
  enabled: true
 gpt-sovits:
  enabled: true
 # Disabled components: tgi and speecht5.
 tgi:
  enabled: false
 speecht5:
  enabled: false
--- a/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,12 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # Enabled components: tgi and speecht5.
 tgi:
  enabled: true
 speecht5:
  enabled: true
 # Disabled components: vllm and gpt-sovits.
 vllm:
  enabled: false
 gpt-sovits:
  enabled: false
--- a/AudioQnA/kubernetes/helm/cpu-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 tgi:
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  enabled: false
 vllm:
  enabled: true
 speecht5:
  enabled: true
 gpt-sovits:
  enabled: false
--- a/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,49 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # tgi is the enabled LLM backend here and requests one habana.ai/gaudi device.
 tgi:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: "1024"
  MAX_TOTAL_TOKENS: "2048"
  CUDA_GRAPHS: ""
  # Quoted so every setting is a string like its neighbours above; an
  # unquoted 0 is an integer zero, which is falsy where a non-empty "0" is not.
  # NOTE(review): presumably passed to the container as environment values -
  # confirm the tgi subchart accepts the string form.
  HF_HUB_DISABLE_PROGRESS_BARS: "1"
  HF_HUB_ENABLE_HF_TRANSFER: "0"
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  # 120 failures at a 5 second period are tolerated during startup.
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
 vllm:
  enabled: false
 # whisper and speecht5 each use a gaudi image and request one device.
 whisper:
  image:
    repository: opea/whisper-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
 speecht5:
  enabled: true
  image:
    repository: opea/speecht5-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
 gpt-sovits:
  enabled: false
--- a/AudioQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -2,35 +2,27 @@
 # SPDX-License-Identifier: Apache-2.0
 tgi:
  enabled: false
 vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
+    repository: opea/vllm-gaudi
-    tag: "2.3.1"
+  startupProbe:
    failureThreshold: 360
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
+  extraCmdArgs: [
-  MAX_TOTAL_TOKENS: "2048"
+    "--tensor-parallel-size", "1",
-  CUDA_GRAPHS: ""
+    "--block-size", "128",
-  HF_HUB_DISABLE_PROGRESS_BARS: 1
+    "--max-num-seqs", "256",
-  HF_HUB_ENABLE_HF_TRANSFER: 0
+    "--max-seq_len-to-capture", "2048"
-  ENABLE_HPU_GRAPH: true
+  ]
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
 whisper:
  image:
@@ -40,8 +32,11 @@ whisper:
      habana.ai/gaudi: 1
 speecht5:
  enabled: true
  image:
    repository: opea/speecht5-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
 gpt-sovits:
  enabled: false
--- a/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
+++ b/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
@@ -0,0 +1,12 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # Vector store selection: qdrant on, redis-vector-db off.
 qdrant:
  enabled: true
 redis-vector-db:
  enabled: false
 # data-prep and retriever-usvc are both set to the QDRANT backend.
 retriever-usvc:
  RETRIEVER_BACKEND: 'QDRANT'
 data-prep:
  DATAPREP_BACKEND: 'QDRANT'
--- a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
+++ b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -44,11 +44,6 @@ tgi:
  #    cpu: 6
  #    memory: 65Gi
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 16
    periodSeconds: 8
@@ -65,17 +60,12 @@ teirerank:
  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
  resources:
    limits:
-      cpu: 4
+      cpu: 24
      memory: 30Gi
    requests:
      cpu: 2
      memory: 25Gi
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
  readinessProbe:
    initialDelaySeconds: 8
    periodSeconds: 8
@@ -91,17 +81,12 @@ tei:
  # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
  resources:
    limits:
-      cpu: 4
+      cpu: 24
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 24
-    timeoutSeconds: 2
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
--- a/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
+++ b/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
@@ -0,0 +1,16 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 CHATQNA_TYPE: 'CHATQNA_FAQGEN'
 # Backend toggles: tgi on, vllm off.
 tgi:
  enabled: true
 vllm:
  enabled: false
 # llm-uservice runs the opea/llm-faqgen image with the TGI backend on port 80.
 llm-uservice:
  enabled: true
  FAQGEN_BACKEND: 'TGI'
  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
  image:
    repository: 'opea/llm-faqgen'
  service:
    port: 80
--- a/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
+++ b/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
@@ -0,0 +1,12 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 CHATQNA_TYPE: 'CHATQNA_FAQGEN'
 # llm-uservice runs the opea/llm-faqgen image with the vLLM backend on port 80.
 llm-uservice:
  enabled: true
  FAQGEN_BACKEND: 'vLLM'
  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
  image:
    repository: 'opea/llm-faqgen'
  service:
    port: 80
--- a/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
+++ b/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
@@ -0,0 +1,60 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 CHATQNA_TYPE: 'CHATQNA_FAQGEN'
 # llm-uservice runs the opea/llm-faqgen image with the TGI backend on port 80.
 llm-uservice:
  enabled: true
  FAQGEN_BACKEND: 'TGI'
  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
  image:
    repository: 'opea/llm-faqgen'
  service:
    port: 80
 vllm:
  enabled: false
 # TGI: largest bottleneck for ChatQnA
 tgi:
  enabled: true
  accelDevice: 'gaudi'
  image:
    repository: 'ghcr.io/huggingface/tgi-gaudi'
    tag: '2.3.1'
  resources:
    limits:
      habana.ai/gaudi: 1
  # higher limits are needed with extra input tokens added by rerank
  MAX_INPUT_LENGTH: '2048'
  MAX_TOTAL_TOKENS: '4096'
  CUDA_GRAPHS: ''
  OMPI_MCA_btl_vader_single_copy_mechanism: 'none'
  ENABLE_HPU_GRAPH: 'true'
  LIMIT_HPU_GRAPH: 'true'
  USE_FLASH_ATTENTION: 'true'
  FLASH_ATTENTION_RECOMPUTE: 'true'
  # 120 failures at a 5 second period are tolerated during startup.
  startupProbe:
    failureThreshold: 120
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
 # Reranking: second largest bottleneck when reranking is in use
 # (i.e. query context docs have been uploaded with data-prep)
 teirerank:
  accelDevice: 'gaudi'
  image:
    repository: 'ghcr.io/huggingface/tei-gaudi'
    tag: '1.5.0'
  resources:
    limits:
      habana.ai/gaudi: 1
  OMPI_MCA_btl_vader_single_copy_mechanism: 'none'
  MAX_WARMUP_SEQUENCE_LENGTH: '512'
  # securityContext:
  #   readOnlyRootFilesystem: false
  readinessProbe:
    timeoutSeconds: 1
--- a/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
@@ -0,0 +1,53 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 CHATQNA_TYPE: "CHATQNA_FAQGEN"
 # llm-uservice runs the opea/llm-faqgen image with the vLLM backend on port 80.
 llm-uservice:
  enabled: true
  image:
    repository: opea/llm-faqgen
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  FAQGEN_BACKEND: "vLLM"
  service:
    port: 80
 tgi:
  enabled: false
 vllm:
  enabled: true
  shmSize: 1Gi
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  # Quoted like the two settings above so it stays a string, not a YAML boolean.
  VLLM_SKIP_WARMUP: "true"
  # --tensor-parallel-size is 1, the same number as the habana.ai/gaudi limit.
  extraCmdArgs: [
    "--tensor-parallel-size", "1",
    "--block-size", "128",
    "--max-num-seqs", "256",
    "--max-seq_len-to-capture", "2048"
  ]
 # Reranking: second largest bottleneck when reranking is in use
 # (i.e. query context docs have been uploaded with data-prep)
 #
 # TODO: could vLLM be used also for reranking / embedding?
 teirerank:
  accelDevice: "gaudi"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  MAX_WARMUP_SEQUENCE_LENGTH: "512"
  image:
    repository: ghcr.io/huggingface/tei-gaudi
    # Quoted so the version is unambiguously a string.
    tag: "1.5.0"
  resources:
    limits:
      habana.ai/gaudi: 1
  # securityContext:
  #   readOnlyRootFilesystem: false
  readinessProbe:
    timeoutSeconds: 1
--- a/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -26,10 +26,6 @@ tgi:
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -54,8 +50,6 @@ teirerank:
      habana.ai/gaudi: 1
  securityContext:
    readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
@@ -73,7 +67,3 @@ teirerank:
 #       habana.ai/gaudi: 1
 #   securityContext:
 #     readOnlyRootFilesystem: false
-#   livenessProbe:
-#     timeoutSeconds: 1
-#   readinessProbe:
-#     timeoutSeconds: 1
--- a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
@@ -16,18 +16,7 @@ vllm:
    limits:
      habana.ai/gaudi: 1
  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 180
+    failureThreshold: 360
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
@@ -55,7 +44,5 @@ teirerank:
      habana.ai/gaudi: 1
  securityContext:
    readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
--- a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
@@ -1,9 +1,12 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-image:
+# Accelerate inferencing in heaviest components to improve performance
-  repository: opea/chatqna-guardrails
+# by overriding their subchart values
 image:
  repository: opea/chatqna
 CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
 # guardrails related config
 guardrails-usvc:
  enabled: true
@@ -22,10 +25,6 @@ guardrails-usvc:
 #       habana.ai/gaudi: 1
 #   securityContext:
 #     readOnlyRootFilesystem: false
-#   livenessProbe:
-#     timeoutSeconds: 1
-#   readinessProbe:
-#     timeoutSeconds: 1
 teirerank:
  accelDevice: "gaudi"
@@ -39,8 +38,6 @@ teirerank:
      habana.ai/gaudi: 1
  securityContext:
    readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
@@ -62,10 +59,6 @@ tgi-guardrails:
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -88,18 +81,7 @@ vllm:
    limits:
      habana.ai/gaudi: 1
  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 180
+    failureThreshold: 360
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
--- a/ChatQnA/kubernetes/helm/norerank-values.yaml
+++ b/ChatQnA/kubernetes/helm/norerank-values.yaml
@@ -1,11 +1,7 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-# Accelerate inferencing in heaviest components to improve performance
-# by overriding their subchart values
-image:
-  repository: opea/chatqna-without-rerank
+CHATQNA_TYPE: "CHATQNA_NO_RERANK"
 teirerank:
  enabled: false
--- a/CodeGen/kubernetes/helm/cpu-tgi-values.yaml
+++ b/CodeGen/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # llm-uservice is pointed at the TGI backend; tgi is on and vllm is off.
 llm-uservice:
  TEXTGEN_BACKEND: 'TGI'
 tgi:
  enabled: true
 vllm:
  enabled: false
--- a/CodeGen/kubernetes/helm/cpu-values.yaml
+++ b/CodeGen/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 tgi:
-  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
+  enabled: false
 vllm:
  enabled: true
 llm-uservice:
  TEXTGEN_BACKEND: vLLM
--- a/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,33 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # llm-uservice is pointed at the TGI backend; vllm is off.
 llm-uservice:
  TEXTGEN_BACKEND: 'TGI'
 vllm:
  enabled: false
 # tgi uses the tgi-gaudi image and requests one habana.ai/gaudi device.
 tgi:
  enabled: true
  accelDevice: 'gaudi'
  image:
    repository: 'ghcr.io/huggingface/tgi-gaudi'
    tag: '2.3.1'
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: '1024'
  MAX_TOTAL_TOKENS: '2048'
  CUDA_GRAPHS: ''
  OMPI_MCA_btl_vader_single_copy_mechanism: 'none'
  ENABLE_HPU_GRAPH: 'true'
  LIMIT_HPU_GRAPH: 'true'
  USE_FLASH_ATTENTION: 'true'
  FLASH_ATTENTION_RECOMPUTE: 'true'
  # 120 failures at a 5 second period are tolerated during startup.
  startupProbe:
    failureThreshold: 120
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
--- a/CodeGen/kubernetes/helm/gaudi-values.yaml
+++ b/CodeGen/kubernetes/helm/gaudi-values.yaml
@@ -2,32 +2,26 @@
 # SPDX-License-Identifier: Apache-2.0
 tgi:
  enabled: false
 vllm:
  enabled: true
  accelDevice: "gaudi"
  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
+    repository: opea/vllm-gaudi
-    tag: "2.3.1"
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  startupProbe:
    failureThreshold: 360
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
+  extraCmdArgs: [
-  MAX_TOTAL_TOKENS: "2048"
+    "--tensor-parallel-size", "1",
-  CUDA_GRAPHS: ""
+    "--block-size", "128",
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+    "--max-num-seqs", "256",
-  ENABLE_HPU_GRAPH: "true"
+  ]
-  LIMIT_HPU_GRAPH: "true"
+
-  USE_FLASH_ATTENTION: "true"
+llm-uservice:
-  FLASH_ATTENTION_RECOMPUTE: "true"
+  TEXTGEN_BACKEND: vLLM
-  livenessProbe:
+  retryTimeoutSeconds: 720
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
--- a/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
+++ b/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # llm-uservice is pointed at the TGI backend; tgi is on and vllm is off.
 llm-uservice:
  TEXTGEN_BACKEND: 'TGI'
 tgi:
  enabled: true
 vllm:
  enabled: false
--- a/CodeTrans/kubernetes/helm/cpu-values.yaml
+++ b/CodeTrans/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 tgi:
-  LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
+  enabled: false
 vllm:
  enabled: true
 llm-uservice:
  TEXTGEN_BACKEND: vLLM
--- a/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,33 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # llm-uservice is pointed at the TGI backend; vllm is off.
 llm-uservice:
  TEXTGEN_BACKEND: 'TGI'
 vllm:
  enabled: false
 # tgi uses the tgi-gaudi image and requests one habana.ai/gaudi device.
 tgi:
  enabled: true
  accelDevice: 'gaudi'
  image:
    repository: 'ghcr.io/huggingface/tgi-gaudi'
    tag: '2.3.1'
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: '2048'
  MAX_TOTAL_TOKENS: '4096'
  CUDA_GRAPHS: ''
  OMPI_MCA_btl_vader_single_copy_mechanism: 'none'
  ENABLE_HPU_GRAPH: 'true'
  LIMIT_HPU_GRAPH: 'true'
  USE_FLASH_ATTENTION: 'true'
  FLASH_ATTENTION_RECOMPUTE: 'true'
  # 120 failures at a 5 second period are tolerated during startup.
  startupProbe:
    failureThreshold: 120
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
--- a/CodeTrans/kubernetes/helm/gaudi-values.yaml
+++ b/CodeTrans/kubernetes/helm/gaudi-values.yaml
@@ -1,32 +1,33 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 tgi:
  enabled: false
 vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
+    repository: opea/vllm-gaudi
-    tag: "2.3.1"
+  startupProbe:
    failureThreshold: 360
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
+  extraCmdArgs: [
-  MAX_TOTAL_TOKENS: "2048"
+    "--tensor-parallel-size", "1",
-  CUDA_GRAPHS: ""
+    "--block-size", "128",
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+    "--max-num-seqs", "256",
-  ENABLE_HPU_GRAPH: "true"
+    "--max-seq_len-to-capture", "2048"
-  LIMIT_HPU_GRAPH: "true"
+  ]
-  USE_FLASH_ATTENTION: "true"
+
-  FLASH_ATTENTION_RECOMPUTE: "true"
+llm-uservice:
-  livenessProbe:
+  TEXTGEN_BACKEND: vLLM
-    initialDelaySeconds: 5
+  retryTimeoutSeconds: 720
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
--- a/DocSum/kubernetes/helm/cpu-tgi-values.yaml
+++ b/DocSum/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # tgi is on and vllm is off; llm-uservice is pointed at the TGI backend.
 tgi:
  enabled: true
 vllm:
  enabled: false
 llm-uservice:
  DOCSUM_BACKEND: 'TGI'
--- a/DocSum/kubernetes/helm/cpu-values.yaml
+++ b/DocSum/kubernetes/helm/cpu-values.yaml
@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 llm-uservice:
  DOCSUM_BACKEND: "vLLM"
 tgi:
-  enabled: true
+  enabled: false
-vllm:
-  enabled: false
 vllm:
  enabled: true
--- a/DocSum/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/DocSum/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,32 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # vllm is off; llm-uservice is pointed at the TGI backend.
 vllm:
  enabled: false
 llm-uservice:
  DOCSUM_BACKEND: "TGI"
 # tgi uses the tgi-gaudi image and requests one habana.ai/gaudi device.
 tgi:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: ghcr.io/huggingface/tgi-gaudi
    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
  CUDA_GRAPHS: ""
  # Quoted so the settings stay strings rather than YAML booleans, matching
  # the quoted form the other gaudi-tgi values files use for the same keys.
  # NOTE(review): presumably passed to the container as environment values -
  # confirm the tgi subchart accepts the string form.
  ENABLE_HPU_GRAPH: "true"
  LIMIT_HPU_GRAPH: "true"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "true"
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  # 120 failures at a 5 second period are tolerated during startup.
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
--- a/DocSum/kubernetes/helm/gaudi-values.yaml
+++ b/DocSum/kubernetes/helm/gaudi-values.yaml
@@ -1,36 +1,32 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-vllm:
+# Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 tgi:
  enabled: false
 llm-uservice:
-  DOCSUM_BACKEND: "TGI"
+  DOCSUM_BACKEND: "vLLM"
  retryTimeoutSeconds: 720
-tgi:
+vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
+    repository: opea/vllm-gaudi
-    tag: "2.3.1"
  resources:
    limits:
      habana.ai/gaudi: 1
-  CUDA_GRAPHS: ""
-  ENABLE_HPU_GRAPH: true
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
  startupProbe:
-    initialDelaySeconds: 5
+    failureThreshold: 360
-    periodSeconds: 5
+
-    timeoutSeconds: 1
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-    failureThreshold: 120
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  extraCmdArgs: [
    "--tensor-parallel-size", "1",
    "--block-size", "128",
    "--max-num-seqs", "256",
    "--max-seq_len-to-capture", "2048"
  ]
--- a/SearchQnA/kubernetes/helm/gaudi-values.yaml
+++ b/SearchQnA/kubernetes/helm/gaudi-values.yaml
@@ -18,10 +18,6 @@ tgi:
  LIMIT_HPU_GRAPH: true
  USE_FLASH_ATTENTION: true
  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
@@ -44,7 +40,5 @@ tei:
  resources:
    limits:
      habana.ai/gaudi: 1
-  livenessProbe:
-    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
--- a/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
+++ b/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # lvm-uservice is pointed at the TGI backend; tgi is on and vllm is off.
 lvm-uservice:
  LVM_BACKEND: 'TGI'
 tgi:
  enabled: true
 vllm:
  enabled: false
--- a/VisualQnA/kubernetes/helm/cpu-values.yaml
+++ b/VisualQnA/kubernetes/helm/cpu-values.yaml
@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 vllm:
  enabled: true
 tgi:
-  MAX_INPUT_LENGTH: "4096"
+  enabled: false
-  MAX_TOTAL_TOKENS: "8192"
+lvm-uservice:
-  LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
+  LVM_BACKEND: "vLLM"
--- a/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,37 @@
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 # TGI: largest bottleneck for VisualQnA
 tgi:
  enabled: true
  accelDevice: 'gaudi'
  image:
    repository: 'ghcr.io/huggingface/tgi-gaudi'
    tag: '2.3.1'
  resources:
    limits:
      habana.ai/gaudi: 1
  MAX_INPUT_LENGTH: '4096'
  MAX_TOTAL_TOKENS: '8192'
  CUDA_GRAPHS: ''
  OMPI_MCA_btl_vader_single_copy_mechanism: 'none'
  ENABLE_HPU_GRAPH: 'true'
  LIMIT_HPU_GRAPH: 'true'
  USE_FLASH_ATTENTION: 'true'
  FLASH_ATTENTION_RECOMPUTE: 'true'
  # 120 failures at a 5 second period are tolerated during startup.
  startupProbe:
    failureThreshold: 120
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
 # vllm is off; lvm-uservice is pointed at the TGI backend.
 vllm:
  enabled: false
 lvm-uservice:
  LVM_BACKEND: 'TGI'
--- a/VisualQnA/kubernetes/helm/gaudi-values.yaml
+++ b/VisualQnA/kubernetes/helm/gaudi-values.yaml
@@ -1,36 +1,24 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-# Accelerate inferencing in heaviest components to improve performance
+vllm:
-# by overriding their subchart values
+  enabled: true
-# TGI: largest bottleneck for VisualQnA
-tgi:
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
+    repository: opea/vllm-gaudi
-    tag: "2.3.1"
+    tag: "latest"
  LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
  VLLM_SKIP_WARMUP: true
  OMPI_MCA_btl_vader_single_copy_mechanism: none
  PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
  extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "4096"
+tgi:
-  MAX_TOTAL_TOKENS: "8192"
+  enabled: false
-  CUDA_GRAPHS: ""
+lvm-uservice:
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  LVM_BACKEND: "vLLM"
-  ENABLE_HPU_GRAPH: "true"
+  # The default model is not stable on Gaudi, use the older model.
-  LIMIT_HPU_GRAPH: "true"
+  # https://github.com/HabanaAI/vllm-fork/issues/841
-  USE_FLASH_ATTENTION: "true"
+  LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120