Fix VLLM_CPU_KVCACHE_SPACE and wrong model id in tests
Signed-off-by: Yao, Qing <qing.yao@intel.com>
This commit is contained in:
@@ -41,6 +41,7 @@ services:
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       host_ip: ${host_ip}
+      VLLM_CPU_KVCACHE_SPACE: 40
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
       interval: 10s
@@ -52,6 +52,7 @@ services:
       VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
       NUM_CARDS: ${NUM_CARDS:-1}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
+      VLLM_CPU_KVCACHE_SPACE: 40
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
       interval: 10s
@@ -49,7 +49,7 @@ function start_services() {
     local llm_container_name="$2"

     cd $WORKPATH/docker_compose
-    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
+    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-32B-Instruct"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export host_ip=${ip_address}
     source set_env.sh
@@ -117,7 +117,7 @@ function validate_microservices() {
         "completion_tokens" \
         "llm-service" \
         "${llm_container_name}" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "def print_hello_world():"}], "max_tokens": 256}'
+        '{"model": "Qwen/Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "def print_hello_world():"}], "max_tokens": 256}'

     # llm microservice
     validate_services \
@@ -51,7 +51,7 @@ function start_services() {
     local llm_container_name="$2"

     cd $WORKPATH/docker_compose
-    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
+    export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-32B-Instruct"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     source set_env.sh
     cd intel/cpu/xeon/
@@ -118,7 +118,7 @@ function validate_microservices() {
         "completion_tokens" \
         "llm-service" \
         "${llm_container_name}" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'
+        '{"model": "Qwen/Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 256}'

     # llm microservice
     validate_services \
@@ -93,7 +93,7 @@ function validate_microservices() {
         "content" \
         "codegen-vllm-service" \
         "codegen-vllm-service" \
-        '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
+        '{"model": "Qwen/Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 17}'
    sleep 10
     # llm microservice
     validate_services \
Reference in New Issue
Block a user