# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in the heaviest components to improve performance
# by overriding their subchart values

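# Switch the shared LLM serving backend from the default TGI to vLLM
# running on Intel Gaudi (HPU) accelerators.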
tgi:
  enabled: false
vllm:
  enabled: true
  accelDevice: "gaudi"
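  # Gaudi-specific serving image; habana.ai/gaudi is the device resource name
  # exposed by the Intel Gaudi Kubernetes device plugin (assumed to be installed
  # on the target nodes).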
  image:
    repository: opea/vllm-gaudi
  resources:
    limits:
      habana.ai/gaudi: 4
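  # Serve Llama 3.3 70B across the 4 requested HPU cards; --tensor-parallel-size
  # is sized to match the habana.ai/gaudi limit above. PT_HPU_ENABLE_LAZY_COLLECTIVES
  # is needed for multi-card (tensor-parallel) inference with HPU graphs, and
  # VLLM_SKIP_WARMUP trades slower first requests for a much faster pod startup.
  # The llama3_json tool-call parser enables the tool calling the agents rely on.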
  LLM_MODEL_ID: "meta-llama/Llama-3.3-70B-Instruct"
  OMPI_MCA_btl_vader_single_copy_mechanism: none
  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
  VLLM_SKIP_WARMUP: true
  shmSize: 16Gi
  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]

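# Point all three agent microservices at the in-cluster vLLM service and use
# the same 70B model for each of them.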
supervisor:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
ragagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
sqlagent:
  llm_endpoint_url: http://{{ .Release.Name }}-vllm
  llm_engine: vllm
  model: "meta-llama/Llama-3.3-70B-Instruct"
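
# Illustrative usage sketch (release name, chart reference, and namespace are
# placeholders, not part of this file):
#   helm install <release> <chart> -f <this-values-file>.yaml -n <namespace>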