Sync values yaml file for 1.3 release (#1748)

Signed-off-by: Dolpher Du <dolpher.du@intel.com>
This commit is contained in:
dolpher
2025-04-08 22:39:40 +08:00
committed by GitHub
parent b14db6dbd3
commit 46ebb78aa3
34 changed files with 580 additions and 212 deletions

View File

@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

View File

@@ -2,4 +2,8 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
enabled: false
vllm:
enabled: true
llm-uservice:
TEXTGEN_BACKEND: vLLM

View File

@@ -0,0 +1,33 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
llm-uservice:
TEXTGEN_BACKEND: TGI

View File

@@ -2,32 +2,26 @@
# SPDX-License-Identifier: Apache-2.0
tgi:
enabled: false
vllm:
enabled: true
accelDevice: "gaudi"
LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
repository: opea/vllm-gaudi
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
startupProbe:
failureThreshold: 360
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "1024"
MAX_TOTAL_TOKENS: "2048"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
]
llm-uservice:
TEXTGEN_BACKEND: vLLM
retryTimeoutSeconds: 720