Sync values yaml file for 1.3 release (#1748)

Signed-off-by: Dolpher Du <dolpher.du@intel.com>

Author: dolpher
Date: 2025-04-08 22:39:40 +08:00
Committed by: GitHub
Parent: b14db6dbd3
Commit: 46ebb78aa3
34 changed files with 580 additions and 212 deletions
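
Note: each section below is a Helm values override file, applied on top of a chart's built-in defaults, e.g. with `helm install <release> <chart> -f <override-file>.yaml` (file paths are not shown in this view). In the modified-file hunks below, lines prefixed with `-` were removed by this commit and lines prefixed with `+` were added; unprefixed lines are unchanged context. The recurring change, sketched here on the subchart names used throughout, is a pair of mutually exclusive backend toggles:

# Minimal sketch of the toggle pattern this commit applies everywhere
tgi:
  enabled: false   # turn off the Text Generation Inference subchart
vllm:
  enabled: true    # serve the model with vLLM instead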

@@ -0,0 +1,22 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: false
vllm:
enabled: true
LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
extraCmdArgs: ["--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Meta-Llama-3-8B-Instruct"
ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Meta-Llama-3-8B-Instruct"
sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Meta-Llama-3-8B-Instruct"
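
Note: the `llm_endpoint_url` values are Helm templates, not literal URLs; `{{ .Release.Name }}` expands at render time to the release name, pointing each agent at the in-cluster vLLM service. For a hypothetical release named `myagent`, the supervisor block renders as:

supervisor:
  llm_endpoint_url: http://myagent-vllm
  llm_engine: vllm
  model: "meta-llama/Meta-Llama-3-8B-Instruct"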

@@ -4,13 +4,32 @@
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
tgi:
enabled: false
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 4
LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
VLLM_SKIP_WARMUP: true
shmSize: 16Gi
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Llama-3.3-70B-Instruct"
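
Note: `--tensor-parallel-size 4` and the `habana.ai/gaudi: 4` resource limit are deliberately equal: the 70B model is sharded across four Gaudi cards, so the pod must be allotted exactly that many devices. Scaling to eight cards (assuming the node has them; hypothetical values) would change both settings together:

vllm:
  resources:
    limits:
      habana.ai/gaudi: 8
  extraCmdArgs: ["--tensor-parallel-size", "8", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]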

@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: false
vllm:
enabled: true
speecht5:
enabled: false
gpt-sovits:
enabled: true
image:
repository: opea/audioqna-multilang

@@ -0,0 +1,12 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
speecht5:
enabled: true
gpt-sovits:
enabled: false

@@ -2,4 +2,11 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
enabled: false
vllm:
enabled: true
speecht5:
enabled: true
gpt-sovits:
enabled: false
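
Note: the AudioQnA variants above treat the LLM backend (`tgi`/`vllm`) and the TTS backend (`speecht5`/`gpt-sovits`) as mutually exclusive pairs; exactly one of each is enabled per file. A combination not shipped here, say TGI with GPT-SoVITS, would be a straightforward override (hypothetical):

tgi:
  enabled: true
vllm:
  enabled: false
speecht5:
  enabled: false
gpt-sovits:
  enabled: true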

@@ -0,0 +1,49 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
whisper:
image:
repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1
speecht5:
enabled: true
image:
repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1
gpt-sovits:
enabled: false
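
Note: the probe values bound TGI's warm-up budget. With `initialDelaySeconds: 5`, `periodSeconds: 5`, and `failureThreshold: 120`, the startup probe tolerates roughly 5 + 120 × 5 ≈ 605 seconds before Kubernetes restarts the container. A model that loads more slowly would need a larger threshold, e.g. (hypothetical):

startupProbe:
  initialDelaySeconds: 5
  periodSeconds: 5
  timeoutSeconds: 1
  failureThreshold: 240   # ~20-minute startup budget instead of ~10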

@@ -2,35 +2,27 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
+  enabled: false
+vllm:
+  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  startupProbe:
+    failureThreshold: 360
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  HF_HUB_DISABLE_PROGRESS_BARS: 1
-  HF_HUB_ENABLE_HF_TRANSFER: 0
-  ENABLE_HPU_GRAPH: true
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
whisper:
image:
@@ -40,8 +32,11 @@ whisper:
habana.ai/gaudi: 1
speecht5:
enabled: true
image:
repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1
gpt-sovits:
enabled: false
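
Note: with vLLM, Whisper, and SpeechT5 each limited to `habana.ai/gaudi: 1`, this deployment consumes three Gaudi cards in total. On a smaller system the ASR/TTS services could fall back to their CPU images, e.g. (hypothetical override, assuming the CPU image names):

whisper:
  image:
    repository: opea/whisper   # CPU image instead of opea/whisper-gaudi
speecht5:
  image:
    repository: opea/speecht5  # CPU image instead of opea/speecht5-gaudi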

@@ -0,0 +1,12 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
redis-vector-db:
enabled: false
qdrant:
enabled: true
data-prep:
DATAPREP_BACKEND: "QDRANT"
retriever-usvc:
RETRIEVER_BACKEND: "QDRANT"
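
Note: swapping the vector store follows the same toggle-plus-selector pattern: the database subcharts are flipped and the `data-prep`/`retriever-usvc` services are pointed at the new backend. Reverting to the chart's default Redis store would mirror the file above (hypothetical sketch):

redis-vector-db:
  enabled: true
qdrant:
  enabled: false
data-prep:
  DATAPREP_BACKEND: "REDIS"
retriever-usvc:
  RETRIEVER_BACKEND: "REDIS"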

@@ -44,11 +44,6 @@ tgi:
# cpu: 6
# memory: 65Gi
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 16
periodSeconds: 8
@@ -65,17 +60,12 @@ teirerank:
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
resources:
limits:
-      cpu: 4
+      cpu: 24
memory: 30Gi
requests:
cpu: 2
memory: 25Gi
-  livenessProbe:
-    initialDelaySeconds: 8
-    periodSeconds: 8
-    failureThreshold: 24
-    timeoutSeconds: 4
readinessProbe:
initialDelaySeconds: 8
periodSeconds: 8
@@ -91,17 +81,12 @@ tei:
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
resources:
limits:
-      cpu: 4
+      cpu: 24
memory: 4Gi
requests:
cpu: 2
memory: 3Gi
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    failureThreshold: 24
-    timeoutSeconds: 2
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
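
Note: the request/limit split above is what lets these CPU values scale: each TEI pod is guaranteed its `requests` (2 CPUs) but may burst up to its `limits` (24 CPUs) when cores are free. On a smaller node both figures would be reduced together, e.g. (hypothetical):

tei:
  resources:
    limits:
      cpu: 8
      memory: 4Gi
    requests:
      cpu: 2
      memory: 3Gi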

@@ -0,0 +1,16 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "TGI"
service:
port: 80
vllm:
enabled: false
tgi:
enabled: true

@@ -0,0 +1,12 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "vLLM"
service:
port: 80
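
Note: the FAQGen variants reuse the ChatQnA chart: `CHATQNA_TYPE: "CHATQNA_FAQGEN"` selects the pipeline, the `llm-uservice` wrapper (image `opea/llm-faqgen`) sits in front of the serving engine, and `FAQGEN_BACKEND` tells it which engine it is talking to. Changing the model means keeping wrapper and backend in sync, e.g. (hypothetical):

llm-uservice:
  LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
vllm:
  LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3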

@@ -0,0 +1,60 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "TGI"
service:
port: 80
vllm:
enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
# higher limits are needed with extra input tokens added by rerank
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
teirerank:
accelDevice: "gaudi"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
image:
repository: ghcr.io/huggingface/tei-gaudi
tag: 1.5.0
resources:
limits:
habana.ai/gaudi: 1
# securityContext:
# readOnlyRootFilesystem: false
readinessProbe:
timeoutSeconds: 1

@@ -0,0 +1,53 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "vLLM"
service:
port: 80
tgi:
enabled: false
vllm:
enabled: true
shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
VLLM_SKIP_WARMUP: true
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
]
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
# TODO: could vLLM be used also for reranking / embedding?
teirerank:
accelDevice: "gaudi"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
image:
repository: ghcr.io/huggingface/tei-gaudi
tag: 1.5.0
resources:
limits:
habana.ai/gaudi: 1
# securityContext:
# readOnlyRootFilesystem: false
readinessProbe:
timeoutSeconds: 1
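
Note: `VLLM_SKIP_WARMUP: true` trades pod startup time for first-request latency on Gaudi: HPU graph compilation is deferred until the first inference calls. If predictable first-token latency matters more than startup speed, warmup can be re-enabled together with a longer startup window (hypothetical sketch):

vllm:
  VLLM_SKIP_WARMUP: false
  startupProbe:
    failureThreshold: 360   # allow the longer warmup before restarts kick in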

@@ -26,10 +26,6 @@ tgi:
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -54,8 +50,6 @@ teirerank:
habana.ai/gaudi: 1
securityContext:
readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
readinessProbe:
timeoutSeconds: 1
@@ -73,7 +67,3 @@ teirerank:
# habana.ai/gaudi: 1
# securityContext:
# readOnlyRootFilesystem: false
-# livenessProbe:
-#   timeoutSeconds: 1
-# readinessProbe:
-#   timeoutSeconds: 1

@@ -16,18 +16,7 @@ vllm:
limits:
habana.ai/gaudi: 1
startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 180
+    failureThreshold: 360
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
@@ -55,7 +44,5 @@ teirerank:
habana.ai/gaudi: 1
securityContext:
readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
readinessProbe:
timeoutSeconds: 1

@@ -1,9 +1,12 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
-image:
-  repository: opea/chatqna-guardrails
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+image:
+  repository: opea/chatqna
+CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
# guardrails related config
guardrails-usvc:
enabled: true
@@ -22,10 +25,6 @@ guardrails-usvc:
# habana.ai/gaudi: 1
# securityContext:
# readOnlyRootFilesystem: false
-# livenessProbe:
-#   timeoutSeconds: 1
-# readinessProbe:
-#   timeoutSeconds: 1
teirerank:
accelDevice: "gaudi"
@@ -39,8 +38,6 @@ teirerank:
habana.ai/gaudi: 1
securityContext:
readOnlyRootFilesystem: false
-  livenessProbe:
-    timeoutSeconds: 1
readinessProbe:
timeoutSeconds: 1
@@ -62,10 +59,6 @@ tgi-guardrails:
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -88,18 +81,7 @@ vllm:
limits:
habana.ai/gaudi: 1
startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 180
+    failureThreshold: 360
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"

@@ -1,11 +1,7 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
-image:
-  repository: opea/chatqna-without-rerank
+CHATQNA_TYPE: "CHATQNA_NO_RERANK"
teirerank:
enabled: false

@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

@@ -2,4 +2,8 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
enabled: false
vllm:
enabled: true
llm-uservice:
TEXTGEN_BACKEND: vLLM

@@ -0,0 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

@@ -2,32 +2,26 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
+  enabled: false
+vllm:
+  enabled: true
  accelDevice: "gaudi"
  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  startupProbe:
+    failureThreshold: 360
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+  ]
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720
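
Note: `retryTimeoutSeconds: 720` gives the llm-uservice wrapper up to 12 minutes to keep retrying its backend, covering vLLM's long first start on Gaudi (model download plus HPU graph warmup). Against a faster-starting backend the budget could be much smaller, e.g. (hypothetical):

llm-uservice:
  TEXTGEN_BACKEND: vLLM
  retryTimeoutSeconds: 300   # 5 minutes is plenty once images and models are cached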

@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

@@ -2,4 +2,8 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
enabled: false
vllm:
enabled: true
llm-uservice:
TEXTGEN_BACKEND: vLLM

@@ -0,0 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

@@ -1,32 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
tgi:
+  enabled: false
+vllm:
+  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+  startupProbe:
+    failureThreshold: 360
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "1024"
-  MAX_TOTAL_TOKENS: "2048"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
+llm-uservice:
+  TEXTGEN_BACKEND: vLLM
+  retryTimeoutSeconds: 720

@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
llm-uservice:
DOCSUM_BACKEND: "TGI"
tgi:
enabled: true
vllm:
enabled: false

@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+llm-uservice:
+  DOCSUM_BACKEND: "vLLM"
tgi:
-  enabled: true
+  enabled: false
vllm:
-  enabled: false
+  enabled: true

@@ -0,0 +1,32 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
vllm:
enabled: false
llm-uservice:
DOCSUM_BACKEND: "TGI"
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120

@@ -1,36 +1,32 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
-vllm:
-  enabled: false
+tgi:
+  enabled: false
llm-uservice:
-  DOCSUM_BACKEND: "TGI"
+  DOCSUM_BACKEND: "vLLM"
+  retryTimeoutSeconds: 720
-tgi:
-  enabled: true
+vllm:
+  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
  resources:
    limits:
      habana.ai/gaudi: 1
-  CUDA_GRAPHS: ""
-  ENABLE_HPU_GRAPH: true
-  LIMIT_HPU_GRAPH: true
-  USE_FLASH_ATTENTION: true
-  FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  startupProbe:
+    failureThreshold: 360
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]

@@ -18,10 +18,6 @@ tgi:
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
@@ -44,7 +40,5 @@ tei:
resources:
limits:
habana.ai/gaudi: 1
-  livenessProbe:
-    timeoutSeconds: 1
readinessProbe:
timeoutSeconds: 1

@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"

@@ -1,7 +1,9 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
vllm:
enabled: true
tgi:
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
enabled: false
lvm-uservice:
LVM_BACKEND: "vLLM"

@@ -0,0 +1,37 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
# TGI: largest bottleneck for VisualQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"

@@ -1,36 +1,24 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
-# TGI: largest bottleneck for VisualQnA
-tgi:
+vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
+    repository: opea/vllm-gaudi
+    tag: "latest"
+  LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
+  VLLM_SKIP_WARMUP: true
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
+  extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
  resources:
    limits:
      habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+tgi:
+  enabled: false
+lvm-uservice:
+  LVM_BACKEND: "vLLM"
+  # The default model is not stable on Gaudi, use the older model.
+  # https://github.com/HabanaAI/vllm-fork/issues/841
+  LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf