Update vLLM parameter max-seq-len-to-capture (#1809)
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
This commit is contained in:
@@ -6,7 +6,7 @@ tgi:
|
|||||||
vllm:
|
vllm:
|
||||||
enabled: true
|
enabled: true
|
||||||
LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
|
LLM_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
extraCmdArgs: ["--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
extraCmdArgs: ["--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
||||||
|
|
||||||
supervisor:
|
supervisor:
|
||||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ vllm:
|
|||||||
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
|
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
|
||||||
VLLM_SKIP_WARMUP: true
|
VLLM_SKIP_WARMUP: true
|
||||||
shmSize: 16Gi
|
shmSize: 16Gi
|
||||||
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq-len-to-capture", "16384", "--enable-auto-tool-choice", "--tool-call-parser", "llama3_json"]
|
||||||
|
|
||||||
supervisor:
|
supervisor:
|
||||||
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
llm_endpoint_url: http://{{ .Release.Name }}-vllm
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
||||||
audioqna-gaudi-backend-server:
|
audioqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
|
image: ${REGISTRY:-opea}/audioqna:${TAG:-latest}
|
||||||
container_name: audioqna-gaudi-backend-server
|
container_name: audioqna-gaudi-backend-server
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ vllm:
|
|||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", "1",
|
||||||
"--block-size", "128",
|
"--block-size", "128",
|
||||||
"--max-num-seqs", "256",
|
"--max-num-seqs", "256",
|
||||||
"--max-seq_len-to-capture", "2048"
|
"--max-seq-len-to-capture", "2048"
|
||||||
]
|
]
|
||||||
|
|
||||||
whisper:
|
whisper:
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 2048
|
||||||
chatqna-gaudi-backend-server:
|
chatqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||||
container_name: chatqna-gaudi-backend-server
|
container_name: chatqna-gaudi-backend-server
|
||||||
|
|||||||
@@ -108,7 +108,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 2048
|
||||||
llm-faqgen:
|
llm-faqgen:
|
||||||
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
image: ${REGISTRY:-opea}/llm-faqgen:${TAG:-latest}
|
||||||
container_name: llm-faqgen-server
|
container_name: llm-faqgen-server
|
||||||
|
|||||||
@@ -139,7 +139,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 2048
|
||||||
chatqna-gaudi-backend-server:
|
chatqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||||
container_name: chatqna-gaudi-guardrails-server
|
container_name: chatqna-gaudi-guardrails-server
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
|
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture 2048
|
||||||
chatqna-gaudi-backend-server:
|
chatqna-gaudi-backend-server:
|
||||||
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
|
||||||
container_name: chatqna-gaudi-backend-server
|
container_name: chatqna-gaudi-backend-server
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ vllm:
|
|||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", "1",
|
||||||
"--block-size", "128",
|
"--block-size", "128",
|
||||||
"--max-num-seqs", "256",
|
"--max-num-seqs", "256",
|
||||||
"--max-seq_len-to-capture", "2048"
|
"--max-seq-len-to-capture", "2048"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Reranking: second largest bottleneck when reranking is in use
|
# Reranking: second largest bottleneck when reranking is in use
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ vllm:
|
|||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", "1",
|
||||||
"--block-size", "128",
|
"--block-size", "128",
|
||||||
"--max-num-seqs", "256",
|
"--max-num-seqs", "256",
|
||||||
"--max-seq_len-to-capture", "2048"
|
"--max-seq-len-to-capture", "2048"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Reranking: second largest bottleneck when reranking is in use
|
# Reranking: second largest bottleneck when reranking is in use
|
||||||
|
|||||||
@@ -90,5 +90,5 @@ vllm:
|
|||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", "1",
|
||||||
"--block-size", "128",
|
"--block-size", "128",
|
||||||
"--max-num-seqs", "256",
|
"--max-num-seqs", "256",
|
||||||
"--max-seq_len-to-capture", "2048"
|
"--max-seq-len-to-capture", "2048"
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
||||||
llm:
|
llm:
|
||||||
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest}
|
||||||
container_name: codetrans-xeon-llm-server
|
container_name: codetrans-xeon-llm-server
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ vllm:
|
|||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", "1",
|
||||||
"--block-size", "128",
|
"--block-size", "128",
|
||||||
"--max-num-seqs", "256",
|
"--max-num-seqs", "256",
|
||||||
"--max-seq_len-to-capture", "2048"
|
"--max-seq-len-to-capture", "2048"
|
||||||
]
|
]
|
||||||
|
|
||||||
llm-uservice:
|
llm-uservice:
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ services:
|
|||||||
cap_add:
|
cap_add:
|
||||||
- SYS_NICE
|
- SYS_NICE
|
||||||
ipc: host
|
ipc: host
|
||||||
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
|
||||||
|
|
||||||
llm-docsum-vllm:
|
llm-docsum-vllm:
|
||||||
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
|
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
|
||||||
|
|||||||
@@ -28,5 +28,5 @@ vllm:
|
|||||||
"--tensor-parallel-size", "1",
|
"--tensor-parallel-size", "1",
|
||||||
"--block-size", "128",
|
"--block-size", "128",
|
||||||
"--max-num-seqs", "256",
|
"--max-num-seqs", "256",
|
||||||
"--max-seq_len-to-capture", "2048"
|
"--max-seq-len-to-capture", "2048"
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user