Fix VLLM_CPU_KVCACHE_SPACE and wrong model id in tests
Signed-off-by: Yao, Qing <qing.yao@intel.com>
This commit is contained in:
@@ -41,6 +41,7 @@ services:
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       host_ip: ${host_ip}
+      VLLM_CPU_KVCACHE_SPACE: 40
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
       interval: 10s
@@ -52,6 +52,7 @@ services:
       VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
       NUM_CARDS: ${NUM_CARDS:-1}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
+      VLLM_CPU_KVCACHE_SPACE: 40
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
       interval: 10s
@@ -49,7 +49,7 @@ function start_services() {
     local llm_container_name="$2"

     cd $WORKPATH/docker_compose
-    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
+    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-32B-Instruct"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export host_ip=${ip_address}
     source set_env.sh
@@ -117,7 +117,7 @@ function validate_microservices() {
         "completion_tokens" \
         "llm-service" \
         "${llm_container_name}" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "def print_hello_world():"}], "max_tokens": 256}'
+        '{"model": "Qwen/Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "def print_hello_world():"}], "max_tokens": 256}'

     # llm microservice
     validate_services \
@@ -51,7 +51,7 @@ function start_services() {
     local llm_container_name="$2"

     cd $WORKPATH/docker_compose
-    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
+    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-32B-Instruct"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     source set_env.sh
     cd intel/cpu/xeon/
@@ -118,7 +118,7 @@ function validate_microservices() {
         "completion_tokens" \
         "llm-service" \
         "${llm_container_name}" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'
+        '{"model": "Qwen/Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'

     # llm microservice
     validate_services \
@@ -93,7 +93,7 @@ function validate_microservices() {
         "content" \
         "codegen-vllm-service" \
         "codegen-vllm-service" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+        '{"model": "Qwen/Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
    sleep 10
     # llm microservice
     validate_services \
Reference in New Issue
Block a user