From 46ebb78aa3c76d10b9461f011f69fd80cd99833f Mon Sep 17 00:00:00 2001
From: dolpher <dolpher.du@intel.com>
Date: Tue, 8 Apr 2025 22:39:40 +0800
Subject: [PATCH] Sync values yaml file for 1.3 release (#1748)

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
---
 AgentQnA/kubernetes/helm/cpu-values.yaml      | 22 +++++++
 AgentQnA/kubernetes/helm/gaudi-values.yaml    | 19 ++++++
 .../kubernetes/helm/cpu-multilang-values.yaml | 15 +++++
 AudioQnA/kubernetes/helm/cpu-tgi-values.yaml  | 12 ++++
 AudioQnA/kubernetes/helm/cpu-values.yaml      |  9 ++-
 .../kubernetes/helm/gaudi-tgi-values.yaml     | 49 +++++++++++++++
 AudioQnA/kubernetes/helm/gaudi-values.yaml    | 43 ++++++-------
 .../kubernetes/helm/cpu-qdrant-values.yaml    | 12 ++++
 ChatQnA/kubernetes/helm/cpu-tgi-values.yaml   | 19 +-----
 .../helm/faqgen-cpu-tgi-values.yaml           | 16 +++++
 .../kubernetes/helm/faqgen-cpu-values.yaml    | 12 ++++
 .../helm/faqgen-gaudi-tgi-values.yaml         | 60 +++++++++++++++++++
 .../kubernetes/helm/faqgen-gaudi-values.yaml  | 53 ++++++++++++++++
 ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml | 10 ----
 ...udi-vllm-values.yaml => gaudi-values.yaml} | 15 +----
 .../helm/guardrails-gaudi-values.yaml         | 32 +++-------
 ChatQnA/kubernetes/helm/norerank-values.yaml  |  8 +--
 CodeGen/kubernetes/helm/cpu-tgi-values.yaml   |  9 +++
 CodeGen/kubernetes/helm/cpu-values.yaml       |  6 +-
 CodeGen/kubernetes/helm/gaudi-tgi-values.yaml | 33 ++++++++++
 CodeGen/kubernetes/helm/gaudi-values.yaml     | 42 ++++++-------
 CodeTrans/kubernetes/helm/cpu-tgi-values.yaml |  9 +++
 CodeTrans/kubernetes/helm/cpu-values.yaml     |  6 +-
 .../kubernetes/helm/gaudi-tgi-values.yaml     | 33 ++++++++++
 CodeTrans/kubernetes/helm/gaudi-values.yaml   | 47 ++++++++-------
 DocSum/kubernetes/helm/cpu-tgi-values.yaml    |  9 +++
 DocSum/kubernetes/helm/cpu-values.yaml        |  8 ++-
 DocSum/kubernetes/helm/gaudi-tgi-values.yaml  | 32 ++++++++++
 DocSum/kubernetes/helm/gaudi-values.yaml      | 44 +++++++-------
 SearchQnA/kubernetes/helm/gaudi-values.yaml   |  6 --
 VisualQnA/kubernetes/helm/cpu-tgi-values.yaml |  9 +++
 VisualQnA/kubernetes/helm/cpu-values.yaml     | 10 ++--
 .../kubernetes/helm/gaudi-tgi-values.yaml     | 37 ++++++++++++
 VisualQnA/kubernetes/helm/gaudi-values.yaml   | 46 ++++++--------
 34 files changed, 580 insertions(+), 212 deletions(-)
 create mode 100644 AgentQnA/kubernetes/helm/cpu-values.yaml
 create mode 100644 AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
 create mode 100644 AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
 create mode 100644 AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
 create mode 100644 ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
 create mode 100644 ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
 create mode 100644 ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
 create mode 100644 ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
 create mode 100644 ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
 rename ChatQnA/kubernetes/helm/{gaudi-vllm-values.yaml => gaudi-values.yaml} (78%)
 create mode 100644 CodeGen/kubernetes/helm/cpu-tgi-values.yaml
 create mode 100644 CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
 create mode 100644 CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
 create mode 100644 CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
 create mode 100644 DocSum/kubernetes/helm/cpu-tgi-values.yaml
 create mode 100644 DocSum/kubernetes/helm/gaudi-tgi-values.yaml
 create mode 100644 VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
 create mode 100644 VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml

diff --git a/AgentQnA/kubernetes/helm/cpu-values.yaml b/AgentQnA/kubernetes/helm/cpu-values.yaml
new file mode 100644
index 000000000..1def3bca9
--- /dev/null
+++ b/AgentQnA/kubernetes/helm/cpu-values.yaml
@@ -0,0 +1,22 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+vllm:  # LLM serving backend that every agent below points at
+  enabled: true
+  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
+  extraCmdArgs: ['--max-seq_len-to-capture', '16384', '--enable-auto-tool-choice', '--tool-call-parser', 'llama3_json']
+tgi:  # alternative backend, switched off in this values file
+  enabled: false
+
+supervisor:
+  model: 'meta-llama/Meta-Llama-3-8B-Instruct'
+  llm_engine: vllm
+  llm_endpoint_url: 'http://{{ .Release.Name }}-vllm'
+ragagent:
+  model: 'meta-llama/Meta-Llama-3-8B-Instruct'
+  llm_engine: vllm
+  llm_endpoint_url: 'http://{{ .Release.Name }}-vllm'
+sqlagent:
+  model: 'meta-llama/Meta-Llama-3-8B-Instruct'
+  llm_engine: vllm
+  llm_endpoint_url: 'http://{{ .Release.Name }}-vllm'
diff --git a/AgentQnA/kubernetes/helm/gaudi-values.yaml b/AgentQnA/kubernetes/helm/gaudi-values.yaml
index 2d171ea22..fc040abc0 100644
--- a/AgentQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AgentQnA/kubernetes/helm/gaudi-values.yaml
@@ -4,13 +4,32 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
+tgi:
+  enabled: false
 vllm:
   enabled: true
+  accelDevice: "gaudi"
   image:
     repository: opea/vllm-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 4  # same count as --tensor-parallel-size below
+  LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"  # quoted string, like the same key in the other Gaudi values files
+  VLLM_SKIP_WARMUP: "true"  # quoted so the value stays a string rather than a YAML boolean
+  shmSize: 16Gi
+  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
+
 supervisor:
   llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
 ragagent:
   llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
 sqlagent:
   llm_endpoint_url: http://{{ .Release.Name }}-vllm
+  llm_engine: vllm
+  model: "meta-llama/Llama-3.3-70B-Instruct"
diff --git a/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml b/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
new file mode 100644
index 000000000..84ff9161b
--- /dev/null
+++ b/AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
@@ -0,0 +1,15 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+image:
+  repository: 'opea/audioqna-multilang'
+# vLLM enabled, TGI disabled
+vllm:
+  enabled: true
+tgi:
+  enabled: false
+# gpt-sovits enabled in place of speecht5
+gpt-sovits:
+  enabled: true
+speecht5:
+  enabled: false
diff --git a/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml b/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 000000000..ada4ee984
--- /dev/null
+++ b/AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,12 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+vllm:
+  enabled: false
+tgi:  # TGI is the enabled one of the tgi/vllm pair in this values file
+  enabled: true
+
+gpt-sovits:
+  enabled: false
+speecht5:  # speecht5 is the enabled one of the speecht5/gpt-sovits pair
+  enabled: true
diff --git a/AudioQnA/kubernetes/helm/cpu-values.yaml b/AudioQnA/kubernetes/helm/cpu-values.yaml
index 97818ae44..5a3c42335 100644
--- a/AudioQnA/kubernetes/helm/cpu-values.yaml
+++ b/AudioQnA/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  enabled: false
+vllm:
+  enabled: true
+
+speecht5:
+  enabled: true
+gpt-sovits:
+  enabled: false
diff --git a/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
new file mode 100644
index 000000000..03e209306
--- /dev/null
+++ b/AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,49 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:  # TGI is the enabled backend here; vllm is disabled further down in this file
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  CUDA_GRAPHS: ""
+  HF_HUB_DISABLE_PROGRESS_BARS: 1  # NOTE(review): integer left as-is; confirm how the tgi chart renders it before quoting
+  HF_HUB_ENABLE_HF_TRANSFER: 0  # NOTE(review): integer left as-is; quoting 0 could change truthiness in a template guard
+  ENABLE_HPU_GRAPH: "true"  # quoted string, as in the other gaudi-tgi values files
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120  # with periodSeconds 5 this allows about 10 minutes of failed startup probes
+vllm:
+  enabled: false
+
+whisper:
+  image:
+    repository: opea/whisper-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+
+speecht5:
+  enabled: true
+  image:
+    repository: opea/speecht5-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+gpt-sovits:
+  enabled: false
diff --git a/AudioQnA/kubernetes/helm/gaudi-values.yaml b/AudioQnA/kubernetes/helm/gaudi-values.yaml
index 9b06ff429..90fb43293 100644
--- a/AudioQnA/kubernetes/helm/gaudi-values.yaml
+++ b/AudioQnA/kubernetes/helm/gaudi-values.yaml
@@ -2,35 +2,27 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
+  enabled: false
+vllm:
+  enabled: true
   accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  startupProbe:
+    failureThreshold: 360
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  HF_HUB_DISABLE_PROGRESS_BARS: 1
-  HF_HUB_ENABLE_HF_TRANSFER: 0
-  ENABLE_HPU_GRAPH: true
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
 
 whisper:
   image:
@@ -40,8 +32,11 @@ whisper:
       habana.ai/gaudi: 1
 
 speecht5:
+  enabled: true
   image:
     repository: opea/speecht5-gaudi
   resources:
     limits:
       habana.ai/gaudi: 1
+gpt-sovits:
+  enabled: false
diff --git a/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml b/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
new file mode 100644
index 000000000..ac2865b17
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
@@ -0,0 +1,12 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+qdrant:  # enabled here in place of redis-vector-db
+  enabled: true
+redis-vector-db:
+  enabled: false
+
+retriever-usvc:
+  RETRIEVER_BACKEND: 'QDRANT'
+data-prep:
+  DATAPREP_BACKEND: 'QDRANT'
diff --git a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
index f552e1d5b..b80c44f04 100644
--- a/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
+++ b/ChatQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -44,11 +44,6 @@ tgi:
   #    cpu: 6
   #    memory: 65Gi
 
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
   readinessProbe:
     initialDelaySeconds: 16
     periodSeconds: 8
@@ -65,17 +60,12 @@ teirerank:
   # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
   resources:
     limits:
-      cpu: 4
+      cpu: 24
       memory: 30Gi
     requests:
       cpu: 2
       memory: 25Gi
 
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
   readinessProbe:
     initialDelaySeconds: 8
     periodSeconds: 8
@@ -91,17 +81,12 @@ tei:
   # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
   resources:
     limits:
-      cpu: 4
+      cpu: 24
       memory: 4Gi
     requests:
       cpu: 2
       memory: 3Gi
 
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 24
-    timeoutSeconds: 2
   readinessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
diff --git a/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
new file mode 100644
index 000000000..6ce475595
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
@@ -0,0 +1,16 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: 'CHATQNA_FAQGEN'
+tgi:  # enabled backend; FAQGEN_BACKEND below names the same one
+  enabled: true
+vllm:
+  enabled: false
+llm-uservice:
+  enabled: true
+  FAQGEN_BACKEND: 'TGI'
+  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
+  image:
+    repository: 'opea/llm-faqgen'
+  service:
+    port: 80
diff --git a/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml b/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
new file mode 100644
index 000000000..2f05eeed3
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
@@ -0,0 +1,12 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: 'CHATQNA_FAQGEN'
+llm-uservice:
+  enabled: true
+  FAQGEN_BACKEND: 'vLLM'  # no tgi/vllm toggles are set in this file; presumably the chart defaults apply - TODO confirm
+  LLM_MODEL_ID: 'meta-llama/Meta-Llama-3-8B-Instruct'
+  image:
+    repository: 'opea/llm-faqgen'
+  service:
+    port: 80
diff --git a/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
new file mode 100644
index 000000000..88fca4ed5
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
@@ -0,0 +1,60 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: "CHATQNA_FAQGEN"
+llm-uservice:
+  enabled: true
+  image:
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  FAQGEN_BACKEND: "TGI"
+  service:
+    port: 80
+vllm:
+  enabled: false
+# TGI: largest bottleneck for ChatQnA
+tgi:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  # higher limits are needed with extra input tokens added by rerank
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+teirerank:
+  accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  image:
+    repository: ghcr.io/huggingface/tei-gaudi
+    tag: 1.5.0
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  # securityContext:
+  #   readOnlyRootFilesystem: false
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
new file mode 100644
index 000000000..de52226f4
--- /dev/null
+++ b/ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
@@ -0,0 +1,53 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+CHATQNA_TYPE: "CHATQNA_FAQGEN"
+llm-uservice:
+  enabled: true
+  image:
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
+  FAQGEN_BACKEND: "vLLM"
+  service:
+    port: 80
+tgi:
+  enabled: false
+vllm:  # enabled backend; FAQGEN_BACKEND in llm-uservice names the same one
+  enabled: true
+  shmSize: 1Gi
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 1  # same count as --tensor-parallel-size below
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  VLLM_SKIP_WARMUP: "true"  # quoted so the value stays a string, like the two settings above
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  image:
+    repository: ghcr.io/huggingface/tei-gaudi
+    tag: 1.5.0
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  # securityContext:
+  #   readOnlyRootFilesystem: false
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
index d4da00c97..027fecb3f 100644
--- a/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -26,10 +26,6 @@ tgi:
   USE_FLASH_ATTENTION: "true"
   FLASH_ATTENTION_RECOMPUTE: "true"
 
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
   readinessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -54,8 +50,6 @@ teirerank:
       habana.ai/gaudi: 1
   securityContext:
     readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
   readinessProbe:
     timeoutSeconds: 1
 
@@ -73,7 +67,3 @@ teirerank:
 #       habana.ai/gaudi: 1
 #   securityContext:
 #     readOnlyRootFilesystem: false
-#   livenessProbe:
-#     timeoutSeconds: 1
-#   readinessProbe:
-#     timeoutSeconds: 1
diff --git a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml b/ChatQnA/kubernetes/helm/gaudi-values.yaml
similarity index 78%
rename from ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
rename to ChatQnA/kubernetes/helm/gaudi-values.yaml
index 76eafae02..2bf9f3dab 100644
--- a/ChatQnA/kubernetes/helm/gaudi-vllm-values.yaml
+++ b/ChatQnA/kubernetes/helm/gaudi-values.yaml
@@ -16,18 +16,7 @@ vllm:
     limits:
       habana.ai/gaudi: 1
   startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 180
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
+    failureThreshold: 360
 
   PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
@@ -55,7 +44,5 @@ teirerank:
       habana.ai/gaudi: 1
   securityContext:
     readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
   readinessProbe:
     timeoutSeconds: 1
diff --git a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
index 8e8a491a0..dc025b386 100644
--- a/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
+++ b/ChatQnA/kubernetes/helm/guardrails-gaudi-values.yaml
@@ -1,9 +1,12 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-image:
-  repository: opea/chatqna-guardrails
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
 
+image:
+  repository: opea/chatqna
+CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
 # guardrails related config
 guardrails-usvc:
   enabled: true
@@ -22,10 +25,6 @@ guardrails-usvc:
 #       habana.ai/gaudi: 1
 #   securityContext:
 #     readOnlyRootFilesystem: false
-#   livenessProbe:
-#     timeoutSeconds: 1
-#   readinessProbe:
-#     timeoutSeconds: 1
 
 teirerank:
   accelDevice: "gaudi"
@@ -39,8 +38,6 @@ teirerank:
       habana.ai/gaudi: 1
   securityContext:
     readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
   readinessProbe:
     timeoutSeconds: 1
 
@@ -62,10 +59,6 @@ tgi-guardrails:
   LIMIT_HPU_GRAPH: "true"
   USE_FLASH_ATTENTION: "true"
   FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
   readinessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -88,18 +81,7 @@ vllm:
     limits:
       habana.ai/gaudi: 1
   startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 180
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
+    failureThreshold: 360
 
   PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
   OMPI_MCA_btl_vader_single_copy_mechanism: "none"
diff --git a/ChatQnA/kubernetes/helm/norerank-values.yaml b/ChatQnA/kubernetes/helm/norerank-values.yaml
index aa069c1c7..f3bcc8b47 100644
--- a/ChatQnA/kubernetes/helm/norerank-values.yaml
+++ b/ChatQnA/kubernetes/helm/norerank-values.yaml
@@ -1,11 +1,7 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# Accelerate inferencing in heaviest components to improve performance
-# by overriding their subchart values
-
-image:
-  repository: opea/chatqna-without-rerank
+CHATQNA_TYPE: "CHATQNA_NO_RERANK"
 
 teirerank:
   enabled: false
diff --git a/CodeGen/kubernetes/helm/cpu-tgi-values.yaml b/CodeGen/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 000000000..accd8674b
--- /dev/null
+++ b/CodeGen/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+llm-uservice:
+  TEXTGEN_BACKEND: 'TGI'  # names the backend enabled below
+tgi:
+  enabled: true
+vllm:
+  enabled: false
diff --git a/CodeGen/kubernetes/helm/cpu-values.yaml b/CodeGen/kubernetes/helm/cpu-values.yaml
index b49541359..ccc843b1c 100644
--- a/CodeGen/kubernetes/helm/cpu-values.yaml
+++ b/CodeGen/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
-  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
+  enabled: false
+vllm:
+  enabled: true
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
diff --git a/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml b/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
new file mode 100644
index 000000000..7aaae3f27
--- /dev/null
+++ b/CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,33 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+llm-uservice:
+  TEXTGEN_BACKEND: TGI
diff --git a/CodeGen/kubernetes/helm/gaudi-values.yaml b/CodeGen/kubernetes/helm/gaudi-values.yaml
index 25ac2c395..95fcce29c 100644
--- a/CodeGen/kubernetes/helm/gaudi-values.yaml
+++ b/CodeGen/kubernetes/helm/gaudi-values.yaml
@@ -2,32 +2,26 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
+  enabled: false
+
+vllm:
+  enabled: true
   accelDevice: "gaudi"
-  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  startupProbe:
+    failureThreshold: 360
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    '--tensor-parallel-size', '1',
+    '--block-size', '128',
+    '--max-num-seqs', '256'
+  ]
+
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720
diff --git a/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml b/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 000000000..accd8674b
--- /dev/null
+++ b/CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+llm-uservice:
+  TEXTGEN_BACKEND: 'TGI'  # names the backend enabled below
+tgi:
+  enabled: true
+vllm:
+  enabled: false
diff --git a/CodeTrans/kubernetes/helm/cpu-values.yaml b/CodeTrans/kubernetes/helm/cpu-values.yaml
index 313f05075..ccc843b1c 100644
--- a/CodeTrans/kubernetes/helm/cpu-values.yaml
+++ b/CodeTrans/kubernetes/helm/cpu-values.yaml
@@ -2,4 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 tgi:
-  LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
+  enabled: false
+vllm:
+  enabled: true
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
diff --git a/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
new file mode 100644
index 000000000..34f5eb4b7
--- /dev/null
+++ b/CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,33 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  MAX_INPUT_LENGTH: "2048"
+  MAX_TOTAL_TOKENS: "4096"
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+llm-uservice:
+  TEXTGEN_BACKEND: TGI
diff --git a/CodeTrans/kubernetes/helm/gaudi-values.yaml b/CodeTrans/kubernetes/helm/gaudi-values.yaml
index 89ed25928..cac19a63f 100644
--- a/CodeTrans/kubernetes/helm/gaudi-values.yaml
+++ b/CodeTrans/kubernetes/helm/gaudi-values.yaml
@@ -1,32 +1,33 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
 tgi:
+  enabled: false
+
+vllm:
+  enabled: true
   accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  startupProbe:
+    failureThreshold: 360
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
+
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720
diff --git a/DocSum/kubernetes/helm/cpu-tgi-values.yaml b/DocSum/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 000000000..9277da305
--- /dev/null
+++ b/DocSum/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+tgi:  # enabled backend; DOCSUM_BACKEND below names the same one
+  enabled: true
+vllm:
+  enabled: false
+llm-uservice:
+  DOCSUM_BACKEND: 'TGI'
diff --git a/DocSum/kubernetes/helm/cpu-values.yaml b/DocSum/kubernetes/helm/cpu-values.yaml
index 6f2ab7768..677e2a89b 100644
--- a/DocSum/kubernetes/helm/cpu-values.yaml
+++ b/DocSum/kubernetes/helm/cpu-values.yaml
@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+llm-uservice:
+  DOCSUM_BACKEND: "vLLM"
 tgi:
-  enabled: true
-vllm:
   enabled: false
+vllm:
+  enabled: true
diff --git a/DocSum/kubernetes/helm/gaudi-tgi-values.yaml b/DocSum/kubernetes/helm/gaudi-tgi-values.yaml
new file mode 100644
index 000000000..818da3cbd
--- /dev/null
+++ b/DocSum/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,32 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+vllm:
+  enabled: false
+
+llm-uservice:
+  DOCSUM_BACKEND: "TGI"
+
+tgi:  # enabled backend; DOCSUM_BACKEND in llm-uservice names the same one
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  CUDA_GRAPHS: ""
+  ENABLE_HPU_GRAPH: "true"  # quoted string, as in the other gaudi-tgi values files
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120  # with periodSeconds 5 this allows about 10 minutes of failed startup probes
diff --git a/DocSum/kubernetes/helm/gaudi-values.yaml b/DocSum/kubernetes/helm/gaudi-values.yaml
index eda0abe8c..f0ccc0d21 100644
--- a/DocSum/kubernetes/helm/gaudi-values.yaml
+++ b/DocSum/kubernetes/helm/gaudi-values.yaml
@@ -1,36 +1,33 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-vllm:
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
   enabled: false
 
 llm-uservice:
-  DOCSUM_BACKEND: "TGI"
+  DOCSUM_BACKEND: "vLLM"
+  retryTimeoutSeconds: 720
 
-tgi:
+vllm:  # keeps accelDevice "gaudi" below, as the other Gaudi vLLM values files set it
   enabled: true
   accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
   resources:
     limits:
       habana.ai/gaudi: 1
-  CUDA_GRAPHS: ""
-  ENABLE_HPU_GRAPH: true
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
   startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    failureThreshold: 360
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
diff --git a/SearchQnA/kubernetes/helm/gaudi-values.yaml b/SearchQnA/kubernetes/helm/gaudi-values.yaml
index ef327645d..a1abc1a44 100644
--- a/SearchQnA/kubernetes/helm/gaudi-values.yaml
+++ b/SearchQnA/kubernetes/helm/gaudi-values.yaml
@@ -18,10 +18,6 @@ tgi:
   LIMIT_HPU_GRAPH: true
   USE_FLASH_ATTENTION: true
   FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
   readinessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -44,7 +40,5 @@ tei:
   resources:
     limits:
       habana.ai/gaudi: 1
-  livenessProbe:
-    timeoutSeconds: 1
   readinessProbe:
     timeoutSeconds: 1
diff --git a/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml b/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
new file mode 100644
index 000000000..dac19e816
--- /dev/null
+++ b/VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
@@ -0,0 +1,9 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+lvm-uservice:
+  LVM_BACKEND: 'TGI'  # names the backend enabled below
+tgi:
+  enabled: true
+vllm:
+  enabled: false
diff --git a/VisualQnA/kubernetes/helm/cpu-values.yaml b/VisualQnA/kubernetes/helm/cpu-values.yaml
index acc9d4e28..18a42ccd7 100644
--- a/VisualQnA/kubernetes/helm/cpu-values.yaml
+++ b/VisualQnA/kubernetes/helm/cpu-values.yaml
@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+vllm:
+  enabled: true
 tgi:
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
+  enabled: false
+lvm-uservice:
+  LVM_BACKEND: "vLLM"
diff --git a/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
new file mode 100644
index 000000000..8868f8d93
--- /dev/null
+++ b/VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
@@ -0,0 +1,37 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+# TGI: largest bottleneck for VisualQnA
+tgi:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  MAX_INPUT_LENGTH: "4096"
+  MAX_TOTAL_TOKENS: "8192"
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+vllm:
+  enabled: false
+lvm-uservice:
+  LVM_BACKEND: "TGI"
diff --git a/VisualQnA/kubernetes/helm/gaudi-values.yaml b/VisualQnA/kubernetes/helm/gaudi-values.yaml
index eb6494a14..bc6d47e9d 100644
--- a/VisualQnA/kubernetes/helm/gaudi-values.yaml
+++ b/VisualQnA/kubernetes/helm/gaudi-values.yaml
@@ -1,36 +1,24 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# Accelerate inferencing in heaviest components to improve performance
-# by overriding their subchart values
-
-# TGI: largest bottleneck for VisualQnA
-tgi:
+vllm:
+  enabled: true
   accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+    tag: "latest"  # NOTE(review): mutable tag; the other Gaudi vLLM values in this patch set no tag - consider pinning
+  LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf  # same model as lvm-uservice.LLM_MODEL_ID in this file
+  VLLM_SKIP_WARMUP: "true"  # quoted so the value stays a string rather than a YAML boolean
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
+  extraCmdArgs: ["--tensor-parallel-size", "1", "--chat-template", "examples/template_llava.jinja"]
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+tgi:
+  enabled: false
+lvm-uservice:
+  LVM_BACKEND: "vLLM"
+  # The default model is not stable on Gaudi, use the older model.
+  # https://github.com/HabanaAI/vllm-fork/issues/841
+  LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf