Sync values yaml file for 1.3 release (#1748)
Signed-off-by: Dolpher Du <dolpher.du@intel.com>
This commit is contained in:
22
AgentQnA/kubernetes/helm/cpu-values.yaml
Normal file
22
AgentQnA/kubernetes/helm/cpu-values.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
extraCmdArgs: ["--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
||||
|
||||
supervisor:
|
||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||
llm_engine: vllm
|
||||
model: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
ragagent:
|
||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||
llm_engine: vllm
|
||||
model: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
sqlagent:
|
||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||
llm_engine: vllm
|
||||
model: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
@@ -4,13 +4,32 @@
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: opea/vllm-gaudi
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 4
|
||||
LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
|
||||
VLLM_SKIP_WARMUP: true
|
||||
shmSize: 16Gi
|
||||
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
||||
|
||||
supervisor:
|
||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||
llm_engine: vllm
|
||||
model: "meta-llama/Llama-3.3-70B-Instruct"
|
||||
ragagent:
|
||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||
llm_engine: vllm
|
||||
model: "meta-llama/Llama-3.3-70B-Instruct"
|
||||
sqlagent:
|
||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||
llm_engine: vllm
|
||||
model: "meta-llama/Llama-3.3-70B-Instruct"
|
||||
|
||||
15
AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
Normal file
15
AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
|
||||
speecht5:
|
||||
enabled: false
|
||||
gpt-sovits:
|
||||
enabled: true
|
||||
|
||||
image:
|
||||
repository: opea/audioqna-multilang
|
||||
12
AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
12
AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
vllm:
|
||||
enabled: false
|
||||
|
||||
speecht5:
|
||||
enabled: true
|
||||
gpt-sovits:
|
||||
enabled: false
|
||||
@@ -2,4 +2,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
|
||||
speecht5:
|
||||
enabled: true
|
||||
gpt-sovits:
|
||||
enabled: false
|
||||
|
||||
49
AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
49
AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,49 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "1024"
|
||||
MAX_TOTAL_TOKENS: "2048"
|
||||
CUDA_GRAPHS: ""
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
vllm:
|
||||
enabled: false
|
||||
|
||||
whisper:
|
||||
image:
|
||||
repository: opea/whisper-gaudi
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
|
||||
speecht5:
|
||||
enabled: true
|
||||
image:
|
||||
repository: opea/speecht5-gaudi
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
gpt-sovits:
|
||||
enabled: false
|
||||
@@ -2,35 +2,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
repository: opea/vllm-gaudi
|
||||
startupProbe:
|
||||
failureThreshold: 360
|
||||
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "1024"
|
||||
MAX_TOTAL_TOKENS: "2048"
|
||||
CUDA_GRAPHS: ""
|
||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
extraCmdArgs: [
|
||||
"--tensor-parallel-size", "1",
|
||||
"--block-size", "128",
|
||||
"--max-num-seqs", "256",
|
||||
"--max-seq_len-to-capture", "2048"
|
||||
]
|
||||
|
||||
whisper:
|
||||
image:
|
||||
@@ -40,8 +32,11 @@ whisper:
|
||||
habana.ai/gaudi: 1
|
||||
|
||||
speecht5:
|
||||
enabled: true
|
||||
image:
|
||||
repository: opea/speecht5-gaudi
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
gpt-sovits:
|
||||
enabled: false
|
||||
|
||||
12
ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
Normal file
12
ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
redis-vector-db:
|
||||
enabled: false
|
||||
qdrant:
|
||||
enabled: true
|
||||
|
||||
data-prep:
|
||||
DATAPREP_BACKEND: "QDRANT"
|
||||
retriever-usvc:
|
||||
RETRIEVER_BACKEND: "QDRANT"
|
||||
@@ -44,11 +44,6 @@ tgi:
|
||||
# cpu: 6
|
||||
# memory: 65Gi
|
||||
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 8
|
||||
periodSeconds: 8
|
||||
failureThreshold: 24
|
||||
timeoutSeconds: 4
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 16
|
||||
periodSeconds: 8
|
||||
@@ -65,17 +60,12 @@ teirerank:
|
||||
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
|
||||
resources:
|
||||
limits:
|
||||
cpu: 4
|
||||
cpu: 24
|
||||
memory: 30Gi
|
||||
requests:
|
||||
cpu: 2
|
||||
memory: 25Gi
|
||||
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 8
|
||||
periodSeconds: 8
|
||||
failureThreshold: 24
|
||||
timeoutSeconds: 4
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 8
|
||||
periodSeconds: 8
|
||||
@@ -91,17 +81,12 @@ tei:
|
||||
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
|
||||
resources:
|
||||
limits:
|
||||
cpu: 4
|
||||
cpu: 24
|
||||
memory: 4Gi
|
||||
requests:
|
||||
cpu: 2
|
||||
memory: 3Gi
|
||||
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 24
|
||||
timeoutSeconds: 2
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
|
||||
16
ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
Normal file
16
ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||
llm-uservice:
|
||||
enabled: true
|
||||
image:
|
||||
repository: opea/llm-faqgen
|
||||
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
FAQGEN_BACKEND: "TGI"
|
||||
service:
|
||||
port: 80
|
||||
vllm:
|
||||
enabled: false
|
||||
tgi:
|
||||
enabled: true
|
||||
12
ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
Normal file
12
ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||
llm-uservice:
|
||||
enabled: true
|
||||
image:
|
||||
repository: opea/llm-faqgen
|
||||
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
FAQGEN_BACKEND: "vLLM"
|
||||
service:
|
||||
port: 80
|
||||
60
ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
Normal file
60
ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||
llm-uservice:
|
||||
enabled: true
|
||||
image:
|
||||
repository: opea/llm-faqgen
|
||||
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
FAQGEN_BACKEND: "TGI"
|
||||
service:
|
||||
port: 80
|
||||
vllm:
|
||||
enabled: false
|
||||
# TGI: largest bottleneck for ChatQnA
|
||||
tgi:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
# higher limits are needed with extra input tokens added by rerank
|
||||
MAX_INPUT_LENGTH: "2048"
|
||||
MAX_TOTAL_TOKENS: "4096"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
|
||||
# Reranking: second largest bottleneck when reranking is in use
|
||||
# (i.e. query context docs have been uploaded with data-prep)
|
||||
teirerank:
|
||||
accelDevice: "gaudi"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
MAX_WARMUP_SEQUENCE_LENGTH: "512"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tei-gaudi
|
||||
tag: 1.5.0
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
# securityContext:
|
||||
# readOnlyRootFilesystem: false
|
||||
readinessProbe:
|
||||
timeoutSeconds: 1
|
||||
53
ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
Normal file
53
ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
Normal file
@@ -0,0 +1,53 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||
llm-uservice:
|
||||
enabled: true
|
||||
image:
|
||||
repository: opea/llm-faqgen
|
||||
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
FAQGEN_BACKEND: "vLLM"
|
||||
service:
|
||||
port: 80
|
||||
tgi:
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
shmSize: 1Gi
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: opea/vllm-gaudi
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
VLLM_SKIP_WARMUP: true
|
||||
|
||||
extraCmdArgs: [
|
||||
"--tensor-parallel-size", "1",
|
||||
"--block-size", "128",
|
||||
"--max-num-seqs", "256",
|
||||
"--max-seq_len-to-capture", "2048"
|
||||
]
|
||||
|
||||
# Reranking: second largest bottleneck when reranking is in use
|
||||
# (i.e. query context docs have been uploaded with data-prep)
|
||||
#
|
||||
# TODO: could vLLM also be used for reranking / embedding?
|
||||
teirerank:
|
||||
accelDevice: "gaudi"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
MAX_WARMUP_SEQUENCE_LENGTH: "512"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tei-gaudi
|
||||
tag: 1.5.0
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
# securityContext:
|
||||
# readOnlyRootFilesystem: false
|
||||
readinessProbe:
|
||||
timeoutSeconds: 1
|
||||
@@ -26,10 +26,6 @@ tgi:
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
@@ -54,8 +50,6 @@ teirerank:
|
||||
habana.ai/gaudi: 1
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: false
|
||||
livenessProbe:
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
timeoutSeconds: 1
|
||||
|
||||
@@ -73,7 +67,3 @@ teirerank:
|
||||
# habana.ai/gaudi: 1
|
||||
# securityContext:
|
||||
# readOnlyRootFilesystem: false
|
||||
# livenessProbe:
|
||||
# timeoutSeconds: 1
|
||||
# readinessProbe:
|
||||
# timeoutSeconds: 1
|
||||
|
||||
@@ -16,18 +16,7 @@ vllm:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 180
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 360
|
||||
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
@@ -55,7 +44,5 @@ teirerank:
|
||||
habana.ai/gaudi: 1
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: false
|
||||
livenessProbe:
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
timeoutSeconds: 1
|
||||
@@ -1,9 +1,12 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
image:
|
||||
repository: opea/chatqna-guardrails
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
image:
|
||||
repository: opea/chatqna
|
||||
CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
|
||||
# guardrails related config
|
||||
guardrails-usvc:
|
||||
enabled: true
|
||||
@@ -22,10 +25,6 @@ guardrails-usvc:
|
||||
# habana.ai/gaudi: 1
|
||||
# securityContext:
|
||||
# readOnlyRootFilesystem: false
|
||||
# livenessProbe:
|
||||
# timeoutSeconds: 1
|
||||
# readinessProbe:
|
||||
# timeoutSeconds: 1
|
||||
|
||||
teirerank:
|
||||
accelDevice: "gaudi"
|
||||
@@ -39,8 +38,6 @@ teirerank:
|
||||
habana.ai/gaudi: 1
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: false
|
||||
livenessProbe:
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
timeoutSeconds: 1
|
||||
|
||||
@@ -62,10 +59,6 @@ tgi-guardrails:
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
@@ -88,18 +81,7 @@ vllm:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 180
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 360
|
||||
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
image:
|
||||
repository: opea/chatqna-without-rerank
|
||||
CHATQNA_TYPE: "CHATQNA_NO_RERANK"
|
||||
|
||||
teirerank:
|
||||
enabled: false
|
||||
|
||||
9
CodeGen/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
CodeGen/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
vllm:
|
||||
enabled: false
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: TGI
|
||||
@@ -2,4 +2,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: vLLM
|
||||
|
||||
33
CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
33
CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "1024"
|
||||
MAX_TOTAL_TOKENS: "2048"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
vllm:
|
||||
enabled: false
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: TGI
|
||||
@@ -2,32 +2,26 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
|
||||
vllm:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
repository: opea/vllm-gaudi
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
startupProbe:
|
||||
failureThreshold: 360
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "1024"
|
||||
MAX_TOTAL_TOKENS: "2048"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
extraCmdArgs: [
|
||||
"--tensor-parallel-size", "1",
|
||||
"--block-size", "128",
|
||||
"--max-num-seqs", "256",
|
||||
]
|
||||
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: vLLM
|
||||
retryTimeoutSeconds: 720
|
||||
|
||||
9
CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
vllm:
|
||||
enabled: false
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: TGI
|
||||
@@ -2,4 +2,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: vLLM
|
||||
|
||||
33
CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
33
CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "2048"
|
||||
MAX_TOTAL_TOKENS: "4096"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
vllm:
|
||||
enabled: false
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: TGI
|
||||
@@ -1,32 +1,33 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
|
||||
vllm:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
repository: opea/vllm-gaudi
|
||||
startupProbe:
|
||||
failureThreshold: 360
|
||||
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "1024"
|
||||
MAX_TOTAL_TOKENS: "2048"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
extraCmdArgs: [
|
||||
"--tensor-parallel-size", "1",
|
||||
"--block-size", "128",
|
||||
"--max-num-seqs", "256",
|
||||
"--max-seq_len-to-capture", "2048"
|
||||
]
|
||||
|
||||
llm-uservice:
|
||||
TEXTGEN_BACKEND: vLLM
|
||||
retryTimeoutSeconds: 720
|
||||
|
||||
9
DocSum/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
DocSum/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
llm-uservice:
|
||||
DOCSUM_BACKEND: "TGI"
|
||||
tgi:
|
||||
enabled: true
|
||||
vllm:
|
||||
enabled: false
|
||||
@@ -1,7 +1,9 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
llm-uservice:
|
||||
DOCSUM_BACKEND: "vLLM"
|
||||
tgi:
|
||||
enabled: true
|
||||
vllm:
|
||||
enabled: false
|
||||
vllm:
|
||||
enabled: true
|
||||
|
||||
32
DocSum/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
32
DocSum/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
vllm:
|
||||
enabled: false
|
||||
|
||||
llm-uservice:
|
||||
DOCSUM_BACKEND: "TGI"
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
CUDA_GRAPHS: ""
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
@@ -1,36 +1,32 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
vllm:
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
tgi:
|
||||
enabled: false
|
||||
|
||||
llm-uservice:
|
||||
DOCSUM_BACKEND: "TGI"
|
||||
DOCSUM_BACKEND: "vLLM"
|
||||
retryTimeoutSeconds: 720
|
||||
|
||||
tgi:
|
||||
vllm:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
repository: opea/vllm-gaudi
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
CUDA_GRAPHS: ""
|
||||
ENABLE_HPU_GRAPH: true
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
failureThreshold: 360
|
||||
|
||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
|
||||
extraCmdArgs: [
|
||||
"--tensor-parallel-size", "1",
|
||||
"--block-size", "128",
|
||||
"--max-num-seqs", "256",
|
||||
"--max-seq_len-to-capture", "2048"
|
||||
]
|
||||
|
||||
@@ -18,10 +18,6 @@ tgi:
|
||||
LIMIT_HPU_GRAPH: true
|
||||
USE_FLASH_ATTENTION: true
|
||||
FLASH_ATTENTION_RECOMPUTE: true
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
@@ -44,7 +40,5 @@ tei:
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
livenessProbe:
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
timeoutSeconds: 1
|
||||
|
||||
9
VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
tgi:
|
||||
enabled: true
|
||||
vllm:
|
||||
enabled: false
|
||||
lvm-uservice:
|
||||
LVM_BACKEND: "TGI"
|
||||
@@ -1,7 +1,9 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
vllm:
|
||||
enabled: true
|
||||
tgi:
|
||||
MAX_INPUT_LENGTH: "4096"
|
||||
MAX_TOTAL_TOKENS: "8192"
|
||||
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
|
||||
enabled: false
|
||||
lvm-uservice:
|
||||
LVM_BACKEND: "vLLM"
|
||||
|
||||
37
VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
37
VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,37 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
# TGI: largest bottleneck for VisualQnA
|
||||
tgi:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "4096"
|
||||
MAX_TOTAL_TOKENS: "8192"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
vllm:
|
||||
enabled: false
|
||||
lvm-uservice:
|
||||
LVM_BACKEND: "TGI"
|
||||
@@ -1,36 +1,24 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Accelerate inferencing in heaviest components to improve performance
|
||||
# by overriding their subchart values
|
||||
|
||||
# TGI: largest bottleneck for VisualQnA
|
||||
tgi:
|
||||
vllm:
|
||||
enabled: true
|
||||
accelDevice: "gaudi"
|
||||
image:
|
||||
repository: ghcr.io/huggingface/tgi-gaudi
|
||||
tag: "2.3.1"
|
||||
repository: opea/vllm-gaudi
|
||||
tag: "latest"
|
||||
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
|
||||
VLLM_SKIP_WARMUP: true
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
|
||||
extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
|
||||
resources:
|
||||
limits:
|
||||
habana.ai/gaudi: 1
|
||||
MAX_INPUT_LENGTH: "4096"
|
||||
MAX_TOTAL_TOKENS: "8192"
|
||||
CUDA_GRAPHS: ""
|
||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||
ENABLE_HPU_GRAPH: "true"
|
||||
LIMIT_HPU_GRAPH: "true"
|
||||
USE_FLASH_ATTENTION: "true"
|
||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||
livenessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
readinessProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
startupProbe:
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 1
|
||||
failureThreshold: 120
|
||||
tgi:
|
||||
enabled: false
|
||||
lvm-uservice:
|
||||
LVM_BACKEND: "vLLM"
|
||||
# The default model is not stable on Gaudi; use the older model instead.
|
||||
# https://github.com/HabanaAI/vllm-fork/issues/841
|
||||
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
|
||||
|
||||
Reference in New Issue
Block a user