Enable vllm for DocSum (#1716)

Set vLLM as the default LLM serving backend, and add the related Docker Compose files, READMEs, and test scripts.

Fix issue #1436

Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Letong Han
2025-03-28 17:15:01 +08:00
committed by GitHub
parent 87baeb833d
commit d4dcbd18ef
12 changed files with 1403 additions and 317 deletions

@@ -2,6 +2,8 @@
This document outlines the deployment process for a Document Summarization application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `llm`. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
The default pipeline deploys with vLLM as the LLM serving component. A TGI backend is also available for the LLM microservice; please refer to the [start-microservice-docker-containers](#start-microservice-docker-containers) section on this page.
## 🚀 Build Docker Images
### 1. Build MicroService Docker Image
@@ -108,9 +110,20 @@ To set up environment variables for deploying Document Summarization services, f
```bash
cd GenAIExamples/DocSum/docker_compose/intel/hpu/gaudi
```
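If you are not using a provided setup script, the sketch below lists the variables that the compose files in this directory reference. The variable names come from those compose files; the values are illustrative placeholders only and should be adjusted for your environment.
```bash
# Minimal sketch: export the variables referenced by compose.yaml / compose_tgi.yaml.
# Values are placeholders, not recommended settings.
export host_ip=$(hostname -I | awk '{print $1}')   # host address the services advertise
export HUGGINGFACEHUB_API_TOKEN="<your-hf-token>"  # required for gated models
export LLM_MODEL_ID="<llm-model-id>"               # model served by vLLM or TGI
export NUM_CARDS=1                                 # Gaudi cards for tensor parallelism (vLLM)
export BLOCK_SIZE=128                              # vLLM KV-cache block size
export MAX_NUM_SEQS=256                            # vLLM max concurrent sequences
export MAX_SEQ_LEN_TO_CAPTURE=2048                 # vLLM graph-capture sequence length
export MAX_INPUT_TOKENS=1024                       # input-token limit (TGI / DocSum wrapper)
export MAX_TOTAL_TOKENS=2048                       # total-token limit (TGI / DocSum wrapper)
export LLM_ENDPOINT_PORT=8008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export MODEL_CACHE="./data"                        # host path mounted as the model cache
```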
If using vLLM as the LLM serving backend:
```bash
docker compose -f compose.yaml up -d
```
If using TGI as the LLM serving backend:
```bash
docker compose -f compose_tgi.yaml up -d
```
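Whichever backend you choose, you can confirm that the stack started before moving on; the LLM serving container reports `healthy` once its health check passes. A quick sketch:
```bash
# Check container status; the LLM backend turns "healthy" once /health responds.
docker compose -f compose.yaml ps        # use compose_tgi.yaml for the TGI variant

# To switch backends later, bring the current stack down before starting the other one.
docker compose -f compose.yaml down
docker compose -f compose_tgi.yaml up -d
```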
You will have the following Docker Images:
1. `opea/docsum-ui:latest`
@@ -120,10 +133,30 @@ You will have the following Docker Images:
### Validate Microservices
1. TGI Service
1. LLM backend Service
During the first startup, this service takes extra time to download, load, and warm up the model. Once that finishes, the service is ready.
Try the commands below to check whether the LLM serving backend is ready.
```bash
curl http://${host_ip}:8008/generate \
# vLLM service
docker logs docsum-gaudi-vllm-service 2>&1 | grep complete
# If the service is ready, you will see a log line like the one below.
INFO: Application startup complete.
```
```bash
# TGI service
docker logs docsum-gaudi-tgi-server | grep Connected
# If the service is ready, you will see a log line like the one below.
2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected
```
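Both compose files also define a health check against the serving container's `/health` endpoint, so you can poll it directly from the host; this sketch assumes the default published port of 8008:
```bash
# Returns success once the vLLM or TGI server is up and serving.
curl -sf http://${host_ip}:8008/health && echo "LLM serving backend is ready"
```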
Then try the `cURL` command below to validate the LLM serving backend.
```bash
# either vLLM or TGI service
curl http://${host_ip}:8008/v1/chat/completions \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
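
# Once the LLM backend responds, you can also exercise the DocSum megaservice end to end.
# Sketch only: the /v1/docsum route and the payload fields are assumed from other OPEA
# releases and may differ here; 8888 is the backend port default in the compose files.
curl -X POST http://${host_ip}:8888/v1/docsum \
  -H 'Content-Type: application/json' \
  -d '{"type": "text", "messages": "Text summarization condenses a long document into a short summary."}'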

@@ -2,47 +2,42 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-gaudi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
vllm-service:
image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
container_name: docsum-gaudi-vllm-service
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
- "8008:80"
volumes:
- "${DATA_PATH:-./data}:/data"
- "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
NUM_CARDS: ${NUM_CARDS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq-len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE}
llm-docsum-tgi:
llm-docsum-vllm:
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
container_name: llm-docsum-gaudi-server
container_name: docsum-gaudi-llm-server
depends_on:
tgi-gaudi-server:
vllm-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
- ${LLM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
@@ -59,7 +54,7 @@ services:
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-server
container_name: docsum-gaudi-whisper-server
ports:
- "7066:7066"
ipc: host
@@ -78,10 +73,10 @@ services:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-gaudi-server
- llm-docsum-tgi
- vllm-service
- llm-docsum-vllm
ports:
- "8888:8888"
- "${BACKEND_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
@@ -99,7 +94,7 @@ services:
depends_on:
- docsum-gaudi-backend-server
ports:
- "5173:5173"
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}

@@ -0,0 +1,114 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
services:
tgi-gaudi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: docsum-gaudi-tgi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "${MODEL_CACHE}:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
container_name: docsum-gaudi-llm-server
depends_on:
tgi-gaudi-server:
condition: service_healthy
ports:
- ${LLM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: docsum-gaudi-whisper-server
ports:
- "7066:7066"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped
docsum-gaudi-backend-server:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-gaudi-server
- llm-docsum-tgi
ports:
- "${BACKEND_SERVICE_PORT:-8888}:8888"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
- ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
ipc: host
restart: always
docsum-gradio-ui:
image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
container_name: docsum-gaudi-ui-server
depends_on:
- docsum-gaudi-backend-server
ports:
- "${FRONTEND_SERVICE_PORT:-5173}:5173"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
networks:
default:
driver: bridge
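Where the compose files parameterize published ports (`LLM_ENDPOINT_PORT`, `LLM_PORT`, `BACKEND_SERVICE_PORT`, `FRONTEND_SERVICE_PORT`), colliding ports can be remapped without editing the YAML. A sketch, assuming the TGI variant and arbitrary free ports:
```bash
# Remap host ports if the defaults (8008/9000/8888/5173) are already in use.
export LLM_ENDPOINT_PORT=9008
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"   # keep the wrapper pointed at the remapped port
export LLM_PORT=9100
export BACKEND_SERVICE_PORT=9888
export FRONTEND_SERVICE_PORT=6173
docker compose -f compose_tgi.yaml up -d
```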