Sync values yaml file for 1.3 release (#1748)

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
This commit is contained in:
dolpher
2025-04-08 22:39:40 +08:00
committed by GitHub
parent b14db6dbd3
commit 46ebb78aa3
34 changed files with 580 additions and 212 deletions

View File

@@ -0,0 +1,22 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: false
vllm:
enabled: true
LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
extraCmdArgs: ["--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Meta-Llama-3-8B-Instruct"
ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Meta-Llama-3-8B-Instruct"
sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Meta-Llama-3-8B-Instruct"

View File

@@ -4,13 +4,32 @@
# Accelerate inferencing in heaviest components to improve performance # Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values # by overriding their subchart values
tgi:
enabled: false
vllm: vllm:
enabled: true enabled: true
accelDevice: "gaudi"
image: image:
repository: opea/vllm-gaudi repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 4
LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
VLLM_SKIP_WARMUP: true
shmSize: 16Gi
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
supervisor: supervisor:
llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent: ragagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent: sqlagent:
llm_endpoint_url: http://{{ .Release.Name }}-vllm llm_endpoint_url: http://{{ .Release.Name }}-vllm
llm_engine: vllm
model: "meta-llama/Llama-3.3-70B-Instruct"

View File

@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: false
vllm:
enabled: true
speecht5:
enabled: false
gpt-sovits:
enabled: true
image:
repository: opea/audioqna-multilang

View File

@@ -0,0 +1,12 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
speecht5:
enabled: true
gpt-sovits:
enabled: false

View File

@@ -2,4 +2,11 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
tgi: tgi:
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 enabled: false
vllm:
enabled: true
speecht5:
enabled: true
gpt-sovits:
enabled: false

View File

@@ -0,0 +1,49 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
whisper:
image:
repository: opea/whisper-gaudi
resources:
limits:
habana.ai/gaudi: 1
speecht5:
enabled: true
image:
repository: opea/speecht5-gaudi
resources:
limits:
habana.ai/gaudi: 1
gpt-sovits:
enabled: false

View File

@@ -2,35 +2,27 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
tgi: tgi:
enabled: false
vllm:
enabled: true
accelDevice: "gaudi" accelDevice: "gaudi"
image: image:
repository: ghcr.io/huggingface/tgi-gaudi repository: opea/vllm-gaudi
tag: "2.3.1" startupProbe:
failureThreshold: 360
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024" extraCmdArgs: [
MAX_TOTAL_TOKENS: "2048" "--tensor-parallel-size", "1",
CUDA_GRAPHS: "" "--block-size", "128",
HF_HUB_DISABLE_PROGRESS_BARS: 1 "--max-num-seqs", "256",
HF_HUB_ENABLE_HF_TRANSFER: 0 "--max-seq_len-to-capture", "2048"
ENABLE_HPU_GRAPH: true ]
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
whisper: whisper:
image: image:
@@ -40,8 +32,11 @@ whisper:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
speecht5: speecht5:
enabled: true
image: image:
repository: opea/speecht5-gaudi repository: opea/speecht5-gaudi
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
gpt-sovits:
enabled: false

View File

@@ -0,0 +1,12 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
redis-vector-db:
enabled: false
qdrant:
enabled: true
data-prep:
DATAPREP_BACKEND: "QDRANT"
retriever-usvc:
RETRIEVER_BACKEND: "QDRANT"

View File

@@ -44,11 +44,6 @@ tgi:
# cpu: 6 # cpu: 6
# memory: 65Gi # memory: 65Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe: readinessProbe:
initialDelaySeconds: 16 initialDelaySeconds: 16
periodSeconds: 8 periodSeconds: 8
@@ -65,17 +60,12 @@ teirerank:
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model: # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
resources: resources:
limits: limits:
cpu: 4 cpu: 24
memory: 30Gi memory: 30Gi
requests: requests:
cpu: 2 cpu: 2
memory: 25Gi memory: 25Gi
livenessProbe:
initialDelaySeconds: 8
periodSeconds: 8
failureThreshold: 24
timeoutSeconds: 4
readinessProbe: readinessProbe:
initialDelaySeconds: 8 initialDelaySeconds: 8
periodSeconds: 8 periodSeconds: 8
@@ -91,17 +81,12 @@ tei:
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model: # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
resources: resources:
limits: limits:
cpu: 4 cpu: 24
memory: 4Gi memory: 4Gi
requests: requests:
cpu: 2 cpu: 2
memory: 3Gi memory: 3Gi
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 24
timeoutSeconds: 2
readinessProbe: readinessProbe:
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 5 periodSeconds: 5

View File

@@ -0,0 +1,16 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "TGI"
service:
port: 80
vllm:
enabled: false
tgi:
enabled: true

View File

@@ -0,0 +1,12 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "vLLM"
service:
port: 80

View File

@@ -0,0 +1,60 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "TGI"
service:
port: 80
vllm:
enabled: false
# TGI: largest bottleneck for ChatQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
# higher limits are needed with extra input tokens added by rerank
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
teirerank:
accelDevice: "gaudi"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
image:
repository: ghcr.io/huggingface/tei-gaudi
tag: 1.5.0
resources:
limits:
habana.ai/gaudi: 1
# securityContext:
# readOnlyRootFilesystem: false
readinessProbe:
timeoutSeconds: 1

View File

@@ -0,0 +1,53 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
enabled: true
image:
repository: opea/llm-faqgen
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
FAQGEN_BACKEND: "vLLM"
service:
port: 80
tgi:
enabled: false
vllm:
enabled: true
shmSize: 1Gi
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
resources:
limits:
habana.ai/gaudi: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
VLLM_SKIP_WARMUP: true
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
]
# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
# TODO: could vLLM be used also for reranking / embedding?
teirerank:
accelDevice: "gaudi"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
image:
repository: ghcr.io/huggingface/tei-gaudi
tag: 1.5.0
resources:
limits:
habana.ai/gaudi: 1
# securityContext:
# readOnlyRootFilesystem: false
readinessProbe:
timeoutSeconds: 1

View File

@@ -26,10 +26,6 @@ tgi:
USE_FLASH_ATTENTION: "true" USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true" FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe: readinessProbe:
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 5 periodSeconds: 5
@@ -54,8 +50,6 @@ teirerank:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
securityContext: securityContext:
readOnlyRootFilesystem: false readOnlyRootFilesystem: false
livenessProbe:
timeoutSeconds: 1
readinessProbe: readinessProbe:
timeoutSeconds: 1 timeoutSeconds: 1
@@ -73,7 +67,3 @@ teirerank:
# habana.ai/gaudi: 1 # habana.ai/gaudi: 1
# securityContext: # securityContext:
# readOnlyRootFilesystem: false # readOnlyRootFilesystem: false
# livenessProbe:
# timeoutSeconds: 1
# readinessProbe:
# timeoutSeconds: 1

View File

@@ -16,18 +16,7 @@ vllm:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
startupProbe: startupProbe:
initialDelaySeconds: 5 failureThreshold: 360
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none" OMPI_MCA_btl_vader_single_copy_mechanism: "none"
@@ -55,7 +44,5 @@ teirerank:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
securityContext: securityContext:
readOnlyRootFilesystem: false readOnlyRootFilesystem: false
livenessProbe:
timeoutSeconds: 1
readinessProbe: readinessProbe:
timeoutSeconds: 1 timeoutSeconds: 1

View File

@@ -1,9 +1,12 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
image: # Accelerate inferencing in heaviest components to improve performance
repository: opea/chatqna-guardrails # by overriding their subchart values
image:
repository: opea/chatqna
CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
# guardrails related config # guardrails related config
guardrails-usvc: guardrails-usvc:
enabled: true enabled: true
@@ -22,10 +25,6 @@ guardrails-usvc:
# habana.ai/gaudi: 1 # habana.ai/gaudi: 1
# securityContext: # securityContext:
# readOnlyRootFilesystem: false # readOnlyRootFilesystem: false
# livenessProbe:
# timeoutSeconds: 1
# readinessProbe:
# timeoutSeconds: 1
teirerank: teirerank:
accelDevice: "gaudi" accelDevice: "gaudi"
@@ -39,8 +38,6 @@ teirerank:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
securityContext: securityContext:
readOnlyRootFilesystem: false readOnlyRootFilesystem: false
livenessProbe:
timeoutSeconds: 1
readinessProbe: readinessProbe:
timeoutSeconds: 1 timeoutSeconds: 1
@@ -62,10 +59,6 @@ tgi-guardrails:
LIMIT_HPU_GRAPH: "true" LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true" USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true" FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe: readinessProbe:
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 5 periodSeconds: 5
@@ -88,18 +81,7 @@ vllm:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
startupProbe: startupProbe:
initialDelaySeconds: 5 failureThreshold: 360
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 180
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none" OMPI_MCA_btl_vader_single_copy_mechanism: "none"

View File

@@ -1,11 +1,7 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance CHATQNA_TYPE: "CHATQNA_NO_RERANK"
# by overriding their subchart values
image:
repository: opea/chatqna-without-rerank
teirerank: teirerank:
enabled: false enabled: false

View File

@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

View File

@@ -2,4 +2,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
tgi: tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct enabled: false
vllm:
enabled: true
llm-uservice:
TEXTGEN_BACKEND: vLLM

View File

@@ -0,0 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

View File

@@ -2,32 +2,26 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
tgi: tgi:
enabled: false
vllm:
enabled: true
accelDevice: "gaudi" accelDevice: "gaudi"
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
image: image:
repository: ghcr.io/huggingface/tgi-gaudi repository: opea/vllm-gaudi
tag: "2.3.1" PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
startupProbe:
failureThreshold: 360
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024" extraCmdArgs: [
MAX_TOTAL_TOKENS: "2048" "--tensor-parallel-size", "1",
CUDA_GRAPHS: "" "--block-size", "128",
OMPI_MCA_btl_vader_single_copy_mechanism: "none" "--max-num-seqs", "256",
ENABLE_HPU_GRAPH: "true" ]
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true" llm-uservice:
FLASH_ATTENTION_RECOMPUTE: "true" TEXTGEN_BACKEND: vLLM
livenessProbe: retryTimeoutSeconds: 720
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120

View File

@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

View File

@@ -2,4 +2,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
tgi: tgi:
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 enabled: false
vllm:
enabled: true
llm-uservice:
TEXTGEN_BACKEND: vLLM

View File

@@ -0,0 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "2048"
MAX_TOTAL_TOKENS: "4096"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

View File

@@ -1,32 +1,33 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
tgi: tgi:
enabled: false
vllm:
enabled: true
accelDevice: "gaudi" accelDevice: "gaudi"
image: image:
repository: ghcr.io/huggingface/tgi-gaudi repository: opea/vllm-gaudi
tag: "2.3.1" startupProbe:
failureThreshold: 360
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024" extraCmdArgs: [
MAX_TOTAL_TOKENS: "2048" "--tensor-parallel-size", "1",
CUDA_GRAPHS: "" "--block-size", "128",
OMPI_MCA_btl_vader_single_copy_mechanism: "none" "--max-num-seqs", "256",
ENABLE_HPU_GRAPH: "true" "--max-seq_len-to-capture", "2048"
LIMIT_HPU_GRAPH: "true" ]
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true" llm-uservice:
livenessProbe: TEXTGEN_BACKEND: vLLM
initialDelaySeconds: 5 retryTimeoutSeconds: 720
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120

View File

@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
llm-uservice:
DOCSUM_BACKEND: "TGI"
tgi:
enabled: true
vllm:
enabled: false

View File

@@ -1,7 +1,9 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
llm-uservice:
DOCSUM_BACKEND: "vLLM"
tgi: tgi:
enabled: true
vllm:
enabled: false enabled: false
vllm:
enabled: true

View File

@@ -0,0 +1,32 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
vllm:
enabled: false
llm-uservice:
DOCSUM_BACKEND: "TGI"
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120

View File

@@ -1,36 +1,32 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
vllm: # Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
tgi:
enabled: false enabled: false
llm-uservice: llm-uservice:
DOCSUM_BACKEND: "TGI" DOCSUM_BACKEND: "vLLM"
retryTimeoutSeconds: 720
tgi: vllm:
enabled: true enabled: true
accelDevice: "gaudi"
image: image:
repository: ghcr.io/huggingface/tgi-gaudi repository: opea/vllm-gaudi
tag: "2.3.1"
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
CUDA_GRAPHS: ""
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe: startupProbe:
initialDelaySeconds: 5 failureThreshold: 360
periodSeconds: 5
timeoutSeconds: 1 PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
failureThreshold: 120 OMPI_MCA_btl_vader_single_copy_mechanism: "none"
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
]

View File

@@ -18,10 +18,6 @@ tgi:
LIMIT_HPU_GRAPH: true LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true FLASH_ATTENTION_RECOMPUTE: true
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe: readinessProbe:
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 5 periodSeconds: 5
@@ -44,7 +40,5 @@ tei:
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
livenessProbe:
timeoutSeconds: 1
readinessProbe: readinessProbe:
timeoutSeconds: 1 timeoutSeconds: 1

View File

@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"

View File

@@ -1,7 +1,9 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
vllm:
enabled: true
tgi: tgi:
MAX_INPUT_LENGTH: "4096" enabled: false
MAX_TOTAL_TOKENS: "8192" lvm-uservice:
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf LVM_BACKEND: "vLLM"

View File

@@ -0,0 +1,37 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values
# TGI: largest bottleneck for VisualQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"

View File

@@ -1,36 +1,24 @@
# Copyright (C) 2024 Intel Corporation # Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Accelerate inferencing in heaviest components to improve performance vllm:
# by overriding their subchart values enabled: true
# TGI: largest bottleneck for VisualQnA
tgi:
accelDevice: "gaudi" accelDevice: "gaudi"
image: image:
repository: ghcr.io/huggingface/tgi-gaudi repository: opea/vllm-gaudi
tag: "2.3.1" tag: "latest"
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
VLLM_SKIP_WARMUP: true
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
resources: resources:
limits: limits:
habana.ai/gaudi: 1 habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096" tgi:
MAX_TOTAL_TOKENS: "8192" enabled: false
CUDA_GRAPHS: "" lvm-uservice:
OMPI_MCA_btl_vader_single_copy_mechanism: "none" LVM_BACKEND: "vLLM"
ENABLE_HPU_GRAPH: "true" # The default model is not stable on Gaudi, use the older model.
LIMIT_HPU_GRAPH: "true" # https://github.com/HabanaAI/vllm-fork/issues/841
USE_FLASH_ATTENTION: "true" LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120