Sync values yaml file for 1.3 release (#1748)
Signed-off-by: Dolpher Du <dolpher.du@intel.com>
This commit is contained in:
22
AgentQnA/kubernetes/helm/cpu-values.yaml
Normal file
22
AgentQnA/kubernetes/helm/cpu-values.yaml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
extraCmdArgs: ["--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
||||||
|
|
||||||
|
supervisor:
|
||||||
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
llm_engine: vllm
|
||||||
|
model: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
ragagent:
|
||||||
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
llm_engine: vllm
|
||||||
|
model: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
sqlagent:
|
||||||
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
llm_engine: vllm
|
||||||
|
model: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
@@ -4,13 +4,32 @@
|
|||||||
# Accelerate inferencing in heaviest components to improve performance
|
# Accelerate inferencing in heaviest components to improve performance
|
||||||
# by overriding their subchart values
|
# by overriding their subchart values
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: false
|
||||||
vllm:
|
vllm:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
image:
|
image:
|
||||||
repository: opea/vllm-gaudi
|
repository: opea/vllm-gaudi
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 4
|
||||||
|
LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||||
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
|
||||||
|
VLLM_SKIP_WARMUP: true
|
||||||
|
shmSize: 16Gi
|
||||||
|
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
||||||
|
|
||||||
supervisor:
|
supervisor:
|
||||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
llm_engine: vllm
|
||||||
|
model: "meta-llama/Llama-3.3-70B-Instruct"
|
||||||
ragagent:
|
ragagent:
|
||||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
llm_engine: vllm
|
||||||
|
model: "meta-llama/Llama-3.3-70B-Instruct"
|
||||||
sqlagent:
|
sqlagent:
|
||||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
llm_engine: vllm
|
||||||
|
model: "meta-llama/Llama-3.3-70B-Instruct"
|
||||||
|
|||||||
15
AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
Normal file
15
AudioQnA/kubernetes/helm/cpu-multilang-values.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
speecht5:
|
||||||
|
enabled: false
|
||||||
|
gpt-sovits:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
image:
|
||||||
|
repository: opea/audioqna-multilang
|
||||||
12
AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
12
AudioQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
speecht5:
|
||||||
|
enabled: true
|
||||||
|
gpt-sovits:
|
||||||
|
enabled: false
|
||||||
@@ -2,4 +2,11 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
tgi:
|
tgi:
|
||||||
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
speecht5:
|
||||||
|
enabled: true
|
||||||
|
gpt-sovits:
|
||||||
|
enabled: false
|
||||||
|
|||||||
49
AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
49
AudioQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tgi-gaudi
|
||||||
|
tag: "2.3.1"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
MAX_INPUT_LENGTH: "1024"
|
||||||
|
MAX_TOTAL_TOKENS: "2048"
|
||||||
|
CUDA_GRAPHS: ""
|
||||||
|
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
||||||
|
HF_HUB_ENABLE_HF_TRANSFER: 0
|
||||||
|
ENABLE_HPU_GRAPH: true
|
||||||
|
LIMIT_HPU_GRAPH: true
|
||||||
|
USE_FLASH_ATTENTION: true
|
||||||
|
FLASH_ATTENTION_RECOMPUTE: true
|
||||||
|
readinessProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
startupProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
failureThreshold: 120
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
whisper:
|
||||||
|
image:
|
||||||
|
repository: opea/whisper-gaudi
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
|
||||||
|
speecht5:
|
||||||
|
enabled: true
|
||||||
|
image:
|
||||||
|
repository: opea/speecht5-gaudi
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
gpt-sovits:
|
||||||
|
enabled: false
|
||||||
@@ -2,35 +2,27 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
tgi:
|
tgi:
|
||||||
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
accelDevice: "gaudi"
|
accelDevice: "gaudi"
|
||||||
image:
|
image:
|
||||||
repository: ghcr.io/huggingface/tgi-gaudi
|
repository: opea/vllm-gaudi
|
||||||
tag: "2.3.1"
|
startupProbe:
|
||||||
|
failureThreshold: 360
|
||||||
|
|
||||||
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
MAX_INPUT_LENGTH: "1024"
|
extraCmdArgs: [
|
||||||
MAX_TOTAL_TOKENS: "2048"
|
"--tensor-parallel-size", "1",
|
||||||
CUDA_GRAPHS: ""
|
"--block-size", "128",
|
||||||
HF_HUB_DISABLE_PROGRESS_BARS: 1
|
"--max-num-seqs", "256",
|
||||||
HF_HUB_ENABLE_HF_TRANSFER: 0
|
"--max-seq_len-to-capture", "2048"
|
||||||
ENABLE_HPU_GRAPH: true
|
]
|
||||||
LIMIT_HPU_GRAPH: true
|
|
||||||
USE_FLASH_ATTENTION: true
|
|
||||||
FLASH_ATTENTION_RECOMPUTE: true
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
startupProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
failureThreshold: 120
|
|
||||||
|
|
||||||
whisper:
|
whisper:
|
||||||
image:
|
image:
|
||||||
@@ -40,8 +32,11 @@ whisper:
|
|||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
|
|
||||||
speecht5:
|
speecht5:
|
||||||
|
enabled: true
|
||||||
image:
|
image:
|
||||||
repository: opea/speecht5-gaudi
|
repository: opea/speecht5-gaudi
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
|
gpt-sovits:
|
||||||
|
enabled: false
|
||||||
|
|||||||
12
ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
Normal file
12
ChatQnA/kubernetes/helm/cpu-qdrant-values.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
redis-vector-db:
|
||||||
|
enabled: false
|
||||||
|
qdrant:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
data-prep:
|
||||||
|
DATAPREP_BACKEND: "QDRANT"
|
||||||
|
retriever-usvc:
|
||||||
|
RETRIEVER_BACKEND: "QDRANT"
|
||||||
@@ -44,11 +44,6 @@ tgi:
|
|||||||
# cpu: 6
|
# cpu: 6
|
||||||
# memory: 65Gi
|
# memory: 65Gi
|
||||||
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 8
|
|
||||||
periodSeconds: 8
|
|
||||||
failureThreshold: 24
|
|
||||||
timeoutSeconds: 4
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
initialDelaySeconds: 16
|
initialDelaySeconds: 16
|
||||||
periodSeconds: 8
|
periodSeconds: 8
|
||||||
@@ -65,17 +60,12 @@ teirerank:
|
|||||||
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
|
# Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
cpu: 4
|
cpu: 24
|
||||||
memory: 30Gi
|
memory: 30Gi
|
||||||
requests:
|
requests:
|
||||||
cpu: 2
|
cpu: 2
|
||||||
memory: 25Gi
|
memory: 25Gi
|
||||||
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 8
|
|
||||||
periodSeconds: 8
|
|
||||||
failureThreshold: 24
|
|
||||||
timeoutSeconds: 4
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
initialDelaySeconds: 8
|
initialDelaySeconds: 8
|
||||||
periodSeconds: 8
|
periodSeconds: 8
|
||||||
@@ -91,17 +81,12 @@ tei:
|
|||||||
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
|
# Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
cpu: 4
|
cpu: 24
|
||||||
memory: 4Gi
|
memory: 4Gi
|
||||||
requests:
|
requests:
|
||||||
cpu: 2
|
cpu: 2
|
||||||
memory: 3Gi
|
memory: 3Gi
|
||||||
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
failureThreshold: 24
|
|
||||||
timeoutSeconds: 2
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
|
|||||||
16
ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
Normal file
16
ChatQnA/kubernetes/helm/faqgen-cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||||
|
llm-uservice:
|
||||||
|
enabled: true
|
||||||
|
image:
|
||||||
|
repository: opea/llm-faqgen
|
||||||
|
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
|
FAQGEN_BACKEND: "TGI"
|
||||||
|
service:
|
||||||
|
port: 80
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
12
ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
Normal file
12
ChatQnA/kubernetes/helm/faqgen-cpu-values.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||||
|
llm-uservice:
|
||||||
|
enabled: true
|
||||||
|
image:
|
||||||
|
repository: opea/llm-faqgen
|
||||||
|
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
|
FAQGEN_BACKEND: "vLLM"
|
||||||
|
service:
|
||||||
|
port: 80
|
||||||
60
ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
Normal file
60
ChatQnA/kubernetes/helm/faqgen-gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||||
|
llm-uservice:
|
||||||
|
enabled: true
|
||||||
|
image:
|
||||||
|
repository: opea/llm-faqgen
|
||||||
|
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
|
FAQGEN_BACKEND: "TGI"
|
||||||
|
service:
|
||||||
|
port: 80
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
# TGI: largest bottleneck for ChatQnA
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tgi-gaudi
|
||||||
|
tag: "2.3.1"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
# higher limits are needed with extra input tokens added by rerank
|
||||||
|
MAX_INPUT_LENGTH: "2048"
|
||||||
|
MAX_TOTAL_TOKENS: "4096"
|
||||||
|
CUDA_GRAPHS: ""
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
ENABLE_HPU_GRAPH: "true"
|
||||||
|
LIMIT_HPU_GRAPH: "true"
|
||||||
|
USE_FLASH_ATTENTION: "true"
|
||||||
|
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
startupProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
failureThreshold: 120
|
||||||
|
|
||||||
|
# Reranking: second largest bottleneck when reranking is in use
|
||||||
|
# (i.e. query context docs have been uploaded with data-prep)
|
||||||
|
teirerank:
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
MAX_WARMUP_SEQUENCE_LENGTH: "512"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tei-gaudi
|
||||||
|
tag: 1.5.0
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
# securityContext:
|
||||||
|
# readOnlyRootFilesystem: false
|
||||||
|
readinessProbe:
|
||||||
|
timeoutSeconds: 1
|
||||||
53
ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
Normal file
53
ChatQnA/kubernetes/helm/faqgen-gaudi-values.yaml
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
CHATQNA_TYPE: "CHATQNA_FAQGEN"
|
||||||
|
llm-uservice:
|
||||||
|
enabled: true
|
||||||
|
image:
|
||||||
|
repository: opea/llm-faqgen
|
||||||
|
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
|
||||||
|
FAQGEN_BACKEND: "vLLM"
|
||||||
|
service:
|
||||||
|
port: 80
|
||||||
|
tgi:
|
||||||
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
shmSize: 1Gi
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: opea/vllm-gaudi
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
|
||||||
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
VLLM_SKIP_WARMUP: true
|
||||||
|
|
||||||
|
extraCmdArgs: [
|
||||||
|
"--tensor-parallel-size", "1",
|
||||||
|
"--block-size", "128",
|
||||||
|
"--max-num-seqs", "256",
|
||||||
|
"--max-seq_len-to-capture", "2048"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Reranking: second largest bottleneck when reranking is in use
|
||||||
|
# (i.e. query context docs have been uploaded with data-prep)
|
||||||
|
#
|
||||||
|
# TODO: could vLLM be used also for reranking / embedding?
|
||||||
|
teirerank:
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
MAX_WARMUP_SEQUENCE_LENGTH: "512"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tei-gaudi
|
||||||
|
tag: 1.5.0
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
# securityContext:
|
||||||
|
# readOnlyRootFilesystem: false
|
||||||
|
readinessProbe:
|
||||||
|
timeoutSeconds: 1
|
||||||
@@ -26,10 +26,6 @@ tgi:
|
|||||||
USE_FLASH_ATTENTION: "true"
|
USE_FLASH_ATTENTION: "true"
|
||||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||||
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
@@ -54,8 +50,6 @@ teirerank:
|
|||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
securityContext:
|
securityContext:
|
||||||
readOnlyRootFilesystem: false
|
readOnlyRootFilesystem: false
|
||||||
livenessProbe:
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
timeoutSeconds: 1
|
timeoutSeconds: 1
|
||||||
|
|
||||||
@@ -73,7 +67,3 @@ teirerank:
|
|||||||
# habana.ai/gaudi: 1
|
# habana.ai/gaudi: 1
|
||||||
# securityContext:
|
# securityContext:
|
||||||
# readOnlyRootFilesystem: false
|
# readOnlyRootFilesystem: false
|
||||||
# livenessProbe:
|
|
||||||
# timeoutSeconds: 1
|
|
||||||
# readinessProbe:
|
|
||||||
# timeoutSeconds: 1
|
|
||||||
|
|||||||
@@ -16,18 +16,7 @@ vllm:
|
|||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
startupProbe:
|
startupProbe:
|
||||||
initialDelaySeconds: 5
|
failureThreshold: 360
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
failureThreshold: 180
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
|
|
||||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
@@ -55,7 +44,5 @@ teirerank:
|
|||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
securityContext:
|
securityContext:
|
||||||
readOnlyRootFilesystem: false
|
readOnlyRootFilesystem: false
|
||||||
livenessProbe:
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
timeoutSeconds: 1
|
timeoutSeconds: 1
|
||||||
@@ -1,9 +1,12 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2025 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
image:
|
# Accelerate inferencing in heaviest components to improve performance
|
||||||
repository: opea/chatqna-guardrails
|
# by overriding their subchart values
|
||||||
|
|
||||||
|
image:
|
||||||
|
repository: opea/chatqna
|
||||||
|
CHATQNA_TYPE: "CHATQNA_GUARDRAILS"
|
||||||
# guardrails related config
|
# guardrails related config
|
||||||
guardrails-usvc:
|
guardrails-usvc:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -22,10 +25,6 @@ guardrails-usvc:
|
|||||||
# habana.ai/gaudi: 1
|
# habana.ai/gaudi: 1
|
||||||
# securityContext:
|
# securityContext:
|
||||||
# readOnlyRootFilesystem: false
|
# readOnlyRootFilesystem: false
|
||||||
# livenessProbe:
|
|
||||||
# timeoutSeconds: 1
|
|
||||||
# readinessProbe:
|
|
||||||
# timeoutSeconds: 1
|
|
||||||
|
|
||||||
teirerank:
|
teirerank:
|
||||||
accelDevice: "gaudi"
|
accelDevice: "gaudi"
|
||||||
@@ -39,8 +38,6 @@ teirerank:
|
|||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
securityContext:
|
securityContext:
|
||||||
readOnlyRootFilesystem: false
|
readOnlyRootFilesystem: false
|
||||||
livenessProbe:
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
timeoutSeconds: 1
|
timeoutSeconds: 1
|
||||||
|
|
||||||
@@ -62,10 +59,6 @@ tgi-guardrails:
|
|||||||
LIMIT_HPU_GRAPH: "true"
|
LIMIT_HPU_GRAPH: "true"
|
||||||
USE_FLASH_ATTENTION: "true"
|
USE_FLASH_ATTENTION: "true"
|
||||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
@@ -88,18 +81,7 @@ vllm:
|
|||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
startupProbe:
|
startupProbe:
|
||||||
initialDelaySeconds: 5
|
failureThreshold: 360
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
failureThreshold: 180
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
|
|
||||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
|||||||
@@ -1,11 +1,7 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2025 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
# Accelerate inferencing in heaviest components to improve performance
|
CHATQNA_TYPE: "CHATQNA_NO_RERANK"
|
||||||
# by overriding their subchart values
|
|
||||||
|
|
||||||
image:
|
|
||||||
repository: opea/chatqna-without-rerank
|
|
||||||
|
|
||||||
teirerank:
|
teirerank:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|||||||
9
CodeGen/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
CodeGen/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
llm-uservice:
|
||||||
|
TEXTGEN_BACKEND: TGI
|
||||||
@@ -2,4 +2,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
tgi:
|
tgi:
|
||||||
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
llm-uservice:
|
||||||
|
TEXTGEN_BACKEND: vLLM
|
||||||
|
|||||||
33
CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
33
CodeGen/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tgi-gaudi
|
||||||
|
tag: "2.3.1"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
MAX_INPUT_LENGTH: "1024"
|
||||||
|
MAX_TOTAL_TOKENS: "2048"
|
||||||
|
CUDA_GRAPHS: ""
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
ENABLE_HPU_GRAPH: "true"
|
||||||
|
LIMIT_HPU_GRAPH: "true"
|
||||||
|
USE_FLASH_ATTENTION: "true"
|
||||||
|
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||||
|
readinessProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
startupProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
failureThreshold: 120
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
llm-uservice:
|
||||||
|
TEXTGEN_BACKEND: TGI
|
||||||
@@ -2,32 +2,26 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
tgi:
|
tgi:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
accelDevice: "gaudi"
|
accelDevice: "gaudi"
|
||||||
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
|
|
||||||
image:
|
image:
|
||||||
repository: ghcr.io/huggingface/tgi-gaudi
|
repository: opea/vllm-gaudi
|
||||||
tag: "2.3.1"
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
startupProbe:
|
||||||
|
failureThreshold: 360
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
MAX_INPUT_LENGTH: "1024"
|
extraCmdArgs: [
|
||||||
MAX_TOTAL_TOKENS: "2048"
|
"--tensor-parallel-size", "1",
|
||||||
CUDA_GRAPHS: ""
|
"--block-size", "128",
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
"--max-num-seqs", "256",
|
||||||
ENABLE_HPU_GRAPH: "true"
|
]
|
||||||
LIMIT_HPU_GRAPH: "true"
|
|
||||||
USE_FLASH_ATTENTION: "true"
|
llm-uservice:
|
||||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
TEXTGEN_BACKEND: vLLM
|
||||||
livenessProbe:
|
retryTimeoutSeconds: 720
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
startupProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
failureThreshold: 120
|
|
||||||
|
|||||||
9
CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
CodeTrans/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
llm-uservice:
|
||||||
|
TEXTGEN_BACKEND: TGI
|
||||||
@@ -2,4 +2,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
tgi:
|
tgi:
|
||||||
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
llm-uservice:
|
||||||
|
TEXTGEN_BACKEND: vLLM
|
||||||
|
|||||||
33
CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
33
CodeTrans/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Copyright (C) 2024 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tgi-gaudi
|
||||||
|
tag: "2.3.1"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
MAX_INPUT_LENGTH: "2048"
|
||||||
|
MAX_TOTAL_TOKENS: "4096"
|
||||||
|
CUDA_GRAPHS: ""
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
ENABLE_HPU_GRAPH: "true"
|
||||||
|
LIMIT_HPU_GRAPH: "true"
|
||||||
|
USE_FLASH_ATTENTION: "true"
|
||||||
|
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||||
|
readinessProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
startupProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
failureThreshold: 120
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
llm-uservice:
|
||||||
|
TEXTGEN_BACKEND: TGI
|
||||||
@@ -1,32 +1,33 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2024 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
# Accelerate inferencing in heaviest components to improve performance
|
||||||
|
# by overriding their subchart values
|
||||||
|
|
||||||
tgi:
|
tgi:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
accelDevice: "gaudi"
|
accelDevice: "gaudi"
|
||||||
image:
|
image:
|
||||||
repository: ghcr.io/huggingface/tgi-gaudi
|
repository: opea/vllm-gaudi
|
||||||
tag: "2.3.1"
|
startupProbe:
|
||||||
|
failureThreshold: 360
|
||||||
|
|
||||||
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
MAX_INPUT_LENGTH: "1024"
|
extraCmdArgs: [
|
||||||
MAX_TOTAL_TOKENS: "2048"
|
"--tensor-parallel-size", "1",
|
||||||
CUDA_GRAPHS: ""
|
"--block-size", "128",
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
"--max-num-seqs", "256",
|
||||||
ENABLE_HPU_GRAPH: "true"
|
"--max-seq_len-to-capture", "2048"
|
||||||
LIMIT_HPU_GRAPH: "true"
|
]
|
||||||
USE_FLASH_ATTENTION: "true"
|
|
||||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
llm-uservice:
|
||||||
livenessProbe:
|
TEXTGEN_BACKEND: vLLM
|
||||||
initialDelaySeconds: 5
|
retryTimeoutSeconds: 720
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
startupProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
failureThreshold: 120
|
|
||||||
|
|||||||
9
DocSum/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
DocSum/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
llm-uservice:
|
||||||
|
DOCSUM_BACKEND: "TGI"
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2025 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
llm-uservice:
|
||||||
|
DOCSUM_BACKEND: "vLLM"
|
||||||
tgi:
|
tgi:
|
||||||
enabled: true
|
|
||||||
vllm:
|
|
||||||
enabled: false
|
enabled: false
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
|
|||||||
32
DocSum/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
32
DocSum/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
llm-uservice:
|
||||||
|
DOCSUM_BACKEND: "TGI"
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tgi-gaudi
|
||||||
|
tag: "2.3.1"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
CUDA_GRAPHS: ""
|
||||||
|
ENABLE_HPU_GRAPH: true
|
||||||
|
LIMIT_HPU_GRAPH: true
|
||||||
|
USE_FLASH_ATTENTION: true
|
||||||
|
FLASH_ATTENTION_RECOMPUTE: true
|
||||||
|
readinessProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
startupProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
failureThreshold: 120
|
||||||
@@ -1,36 +1,32 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2025 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
vllm:
|
# Accelerate inferencing in heaviest components to improve performance
|
||||||
|
# by overriding their subchart values
|
||||||
|
|
||||||
|
tgi:
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
||||||
llm-uservice:
|
llm-uservice:
|
||||||
DOCSUM_BACKEND: "TGI"
|
DOCSUM_BACKEND: "vLLM"
|
||||||
|
retryTimeoutSeconds: 720
|
||||||
|
|
||||||
tgi:
|
vllm:
|
||||||
enabled: true
|
enabled: true
|
||||||
accelDevice: "gaudi"
|
|
||||||
image:
|
image:
|
||||||
repository: ghcr.io/huggingface/tgi-gaudi
|
repository: opea/vllm-gaudi
|
||||||
tag: "2.3.1"
|
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
CUDA_GRAPHS: ""
|
|
||||||
ENABLE_HPU_GRAPH: true
|
|
||||||
LIMIT_HPU_GRAPH: true
|
|
||||||
USE_FLASH_ATTENTION: true
|
|
||||||
FLASH_ATTENTION_RECOMPUTE: true
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
startupProbe:
|
startupProbe:
|
||||||
initialDelaySeconds: 5
|
failureThreshold: 360
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
|
||||||
failureThreshold: 120
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
|
||||||
|
extraCmdArgs: [
|
||||||
|
"--tensor-parallel-size", "1",
|
||||||
|
"--block-size", "128",
|
||||||
|
"--max-num-seqs", "256",
|
||||||
|
"--max-seq_len-to-capture", "2048"
|
||||||
|
]
|
||||||
|
|||||||
@@ -18,10 +18,6 @@ tgi:
|
|||||||
LIMIT_HPU_GRAPH: true
|
LIMIT_HPU_GRAPH: true
|
||||||
USE_FLASH_ATTENTION: true
|
USE_FLASH_ATTENTION: true
|
||||||
FLASH_ATTENTION_RECOMPUTE: true
|
FLASH_ATTENTION_RECOMPUTE: true
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 5
|
periodSeconds: 5
|
||||||
@@ -44,7 +40,5 @@ tei:
|
|||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
livenessProbe:
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
timeoutSeconds: 1
|
timeoutSeconds: 1
|
||||||
|
|||||||
9
VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
9
VisualQnA/kubernetes/helm/cpu-tgi-values.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
lvm-uservice:
|
||||||
|
LVM_BACKEND: "TGI"
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2025 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
vllm:
|
||||||
|
enabled: true
|
||||||
tgi:
|
tgi:
|
||||||
MAX_INPUT_LENGTH: "4096"
|
enabled: false
|
||||||
MAX_TOTAL_TOKENS: "8192"
|
lvm-uservice:
|
||||||
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
|
LVM_BACKEND: "vLLM"
|
||||||
|
|||||||
37
VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
37
VisualQnA/kubernetes/helm/gaudi-tgi-values.yaml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# Copyright (C) 2025 Intel Corporation
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
# Accelerate inferencing in heaviest components to improve performance
|
||||||
|
# by overriding their subchart values
|
||||||
|
|
||||||
|
# TGI: largest bottleneck for VisualQnA
|
||||||
|
tgi:
|
||||||
|
enabled: true
|
||||||
|
accelDevice: "gaudi"
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/huggingface/tgi-gaudi
|
||||||
|
tag: "2.3.1"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
habana.ai/gaudi: 1
|
||||||
|
MAX_INPUT_LENGTH: "4096"
|
||||||
|
MAX_TOTAL_TOKENS: "8192"
|
||||||
|
CUDA_GRAPHS: ""
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
||||||
|
ENABLE_HPU_GRAPH: "true"
|
||||||
|
LIMIT_HPU_GRAPH: "true"
|
||||||
|
USE_FLASH_ATTENTION: "true"
|
||||||
|
FLASH_ATTENTION_RECOMPUTE: "true"
|
||||||
|
readinessProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
startupProbe:
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 1
|
||||||
|
failureThreshold: 120
|
||||||
|
vllm:
|
||||||
|
enabled: false
|
||||||
|
lvm-uservice:
|
||||||
|
LVM_BACKEND: "TGI"
|
||||||
@@ -1,36 +1,24 @@
|
|||||||
# Copyright (C) 2024 Intel Corporation
|
# Copyright (C) 2025 Intel Corporation
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
# Accelerate inferencing in heaviest components to improve performance
|
vllm:
|
||||||
# by overriding their subchart values
|
enabled: true
|
||||||
|
|
||||||
# TGI: largest bottleneck for VisualQnA
|
|
||||||
tgi:
|
|
||||||
accelDevice: "gaudi"
|
accelDevice: "gaudi"
|
||||||
image:
|
image:
|
||||||
repository: ghcr.io/huggingface/tgi-gaudi
|
repository: opea/vllm-gaudi
|
||||||
tag: "2.3.1"
|
tag: "latest"
|
||||||
|
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
|
||||||
|
VLLM_SKIP_WARMUP: true
|
||||||
|
OMPI_MCA_btl_vader_single_copy_mechanism: none
|
||||||
|
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
|
||||||
|
extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
habana.ai/gaudi: 1
|
habana.ai/gaudi: 1
|
||||||
MAX_INPUT_LENGTH: "4096"
|
tgi:
|
||||||
MAX_TOTAL_TOKENS: "8192"
|
enabled: false
|
||||||
CUDA_GRAPHS: ""
|
lvm-uservice:
|
||||||
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
|
LVM_BACKEND: "vLLM"
|
||||||
ENABLE_HPU_GRAPH: "true"
|
# The default model is not stable on Gaudi, use the older model.
|
||||||
LIMIT_HPU_GRAPH: "true"
|
# https://github.com/HabanaAI/vllm-fork/issues/841
|
||||||
USE_FLASH_ATTENTION: "true"
|
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
|
||||||
FLASH_ATTENTION_RECOMPUTE: "true"
|
|
||||||
livenessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
readinessProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
startupProbe:
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 5
|
|
||||||
timeoutSeconds: 1
|
|
||||||
failureThreshold: 120
|
|
||||||
|
|||||||
Reference in New Issue
Block a user