Refactor llm Docsum (#1101)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
This commit is contained in:
XinyaoWa
2025-01-13 15:24:43 +08:00
committed by GitHub
parent 3a7ccb0a75
commit 88f93733b0
29 changed files with 1196 additions and 962 deletions

View File

@@ -0,0 +1,52 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm:
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,63 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi_gaudi_server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm:
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,55 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
vllm-service:
image: opea/vllm:latest
container_name: vllm-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm:
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
vllm-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -8,37 +8,52 @@ services:
image: opea/vllm-gaudi:latest
container_name: vllm-gaudi-server
ports:
- "8008:80"
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_INPUT_TOKENS}
llm:
image: opea/llm-docsum-vllm:latest
container_name: llm-docsum-vllm-server
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
vllm-service:
condition: service_healthy
ports:
- "9000:9000"
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
vLLM_ENDPOINT: ${vLLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:

View File

@@ -19,10 +19,10 @@ COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/llms/summarization/tgi/langchain/requirements.txt
pip install --no-cache-dir -r /home/user/comps/llms/src/doc-summarization/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/summarization/tgi/langchain
WORKDIR /home/user/comps/llms/src/doc-summarization
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,66 +1,44 @@
# Document Summary TGI Microservice
# Document Summary LLM Microservice
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference on Intel Xeon and Gaudi2 processors. You can set the backend service to either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm).
## 🚀1. Start Microservice with Python 🐍 (Option 1)
## 🚀1. Start Microservice with Docker 🐳
To start the LLM microservice, you need to install python packages first.
### 1.1 Setup Environment Variables
### 1.1 Install Requirements
In order to start DocSum services, you need to set up the following environment variables first.
```bash
pip install -r requirements.txt
```
### 1.2 Start LLM Service
```bash
export HF_TOKEN=${your_hf_api_token}
docker run -p 8008:80 -v ./data:/data --name llm-docsum-tgi --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id ${your_hf_llm_model}
```
### 1.3 Verify the TGI Service
```bash
curl http://${your_ip}:8008/v1/chat/completions \
-X POST \
-d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
### 1.4 Start LLM Service with Python Script
```bash
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
python llm.py
```
## 🚀2. Start Microservice with Docker 🐳 (Option 2)
If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service with docker.
### 2.1 Setup Environment Variables
In order to start TGI and LLM services, you need to setup the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export DOCSUM_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export LLM_MODEL_ID=${your_hf_llm_model}
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "OPEADocSum_vLLM"
```
Please make sure MAX_TOTAL_TOKENS is larger than (MAX_INPUT_TOKENS + max_new_tokens + 50); 50 tokens are reserved for the prompt.
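For reference, this budget can be checked with a quick sketch (the values below are the example defaults from section 1.1; `max_new_tokens` stands for the `max_tokens` you plan to request and is an assumed example value):

```python
# Sanity check for the token budget described above (example values; adjust to your setup).
MAX_INPUT_TOKENS = 2048
MAX_TOTAL_TOKENS = 4096
max_new_tokens = 1024        # the "max_tokens" you plan to send per request (assumed example)
RESERVED_PROMPT_TOKENS = 50  # reserved prompt length

assert MAX_TOTAL_TOKENS > MAX_INPUT_TOKENS + max_new_tokens + RESERVED_PROMPT_TOKENS, \
    "MAX_TOTAL_TOKENS must be larger than MAX_INPUT_TOKENS + max_new_tokens + 50"
```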
### 2.2 Build Docker Image
### 1.2 Build Docker Image
Step 1: Prepare the backend LLM docker image.
If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM docker image first. This step is not needed for TGI.
Step 2: Build the DocSum docker image.
```bash
cd ../../../../../
docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
cd ../../../../
docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
```
### 1.3 Run Docker
To start a docker container, you have two options:
- A. Run Docker with CLI
@@ -68,16 +46,45 @@ To start a docker container, you have two options:
You can choose one as needed.
### 2.3 Run Docker with CLI (Option A)
### 1.3.1 Run Docker with CLI (Option A)
Step 1: Start the backend LLM service
Please refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guideline to start a backend LLM service.
Step 2: Start the DocSum microservice
```bash
docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} opea/llm-docsum-tgi:latest
docker run -d \
--name="llm-docsum-server" \
-p 9000:9000 \
--ipc=host \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e LLM_MODEL_ID=$LLM_MODEL_ID \
-e LLM_ENDPOINT=$LLM_ENDPOINT \
-e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
-e DocSum_COMPONENT_NAME=$DocSum_COMPONENT_NAME \
-e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} \
-e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} \
opea/llm-docsum:latest
```
### 2.4 Run Docker with Docker Compose (Option B)
### 1.3.2 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
cd ../../deployment/docker_compose/
# Backend is TGI on Xeon
docker compose -f doc-summarization_tgi.yaml up -d
# Backend is TGI on Gaudi
# docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d
# Backend is vLLM on Xeon
# docker compose -f doc-summarization_vllm.yaml up -d
# Backend is vLLM on Gaudi
# docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d
```
## 🚀3. Consume LLM Service
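Besides the curl examples below, the `/v1/docsum` endpoint can also be consumed directly from Python. A minimal sketch for the streaming case (assuming the microservice is reachable at `localhost:9000`; the payload fields match the curl examples):

```python
import json

import requests  # assumed to be available in your client environment

url = "http://localhost:9000/v1/docsum"  # adjust host/port to your deployment
payload = {
    "query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models.",
    "max_tokens": 32,
    "language": "en",
    "stream": True,
}

with requests.post(url, json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # The service streams server-sent events of the form "data: <json>" and ends with "data: [DONE]".
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        print(json.loads(data))  # each event carries serialized LangChain log ops
```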
@@ -106,19 +113,19 @@ If you want to deal with long context, can select suitable summary type, details
```bash
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'
# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'
# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
@@ -139,7 +146,7 @@ In this mode LLM generate summary based on complete input text. In this case ple
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
-H 'Content-Type: application/json'
@@ -152,7 +159,7 @@ Map_reduce mode will split the inputs into multiple chunks, map each document to
In this mode, the default `chunk_size` is set to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
@@ -165,7 +172,7 @@ Refine mode will split the input into multiple chunks, generate a summary for the first chunk, combine it with the second, and loop over every remaining chunk to get the final summary.
In this mode, the default `chunk_size` is set to `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
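For reference, the default `chunk_size` for the truncate, map_reduce, and refine modes can be reproduced from the environment variables in section 1.1, as in the sketch below (example values; `max_tokens` is the per-request field, e.g. 32 in the curl examples). The refine request itself is shown in the curl example that follows.

```python
# Default chunk_size per summary_type (example values from section 1.1; adjust to your setup).
MAX_INPUT_TOKENS = 2048
MAX_TOTAL_TOKENS = 4096
max_tokens = 32  # "max_tokens" field of the request

# truncate / map_reduce: 50 tokens are reserved for the prompt
chunk_size_truncate = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)      # -> 2048
chunk_size_map_reduce = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)    # -> 2048

# refine: two prompts per step, so 2 * max_tokens plus 128 reserved tokens
chunk_size_refine = min(MAX_TOTAL_TOKENS - 2 * max_tokens - 128, MAX_INPUT_TOKENS)   # -> 2048

# An explicit "chunk_size" in the request is capped at these defaults.
print(chunk_size_truncate, chunk_size_map_reduce, chunk_size_refine)
```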
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
-H 'Content-Type: application/json'

View File

@@ -5,4 +5,4 @@
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py
python opea_docsum_microservice.py

View File

@@ -0,0 +1,204 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType
from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs
from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
MODEL_NAME = os.getenv("LLM_MODEL_ID")
MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
if os.getenv("LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
elif os.getenv("TGI_LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT")
elif os.getenv("vLLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT")
else:
DEFAULT_ENDPOINT = "http://localhost:8080"
def get_llm_endpoint():
if not MODEL_CONFIGS:
return DEFAULT_ENDPOINT
else:
# Validate and Load the models config if MODEL_CONFIGS is not null
configs_map = {}
try:
configs_map = load_model_configs(MODEL_CONFIGS)
except ConfigError as e:
logger.error(f"Failed to load model configurations: {e}")
raise ConfigError(f"Failed to load model configurations: {e}")
try:
return configs_map.get(MODEL_NAME).get("endpoint")
except ConfigError as e:
logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}")
raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs")
class OPEADocSum(OpeaComponent):
"""A specialized OPEA DocSum component derived from OpeaComponent.
Attributes:
client (TGI/vLLM): An instance of the TGI/vLLM client for text generation.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LLM.name.lower(), description, config)
self.access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
self.llm_endpoint = get_llm_endpoint()
self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
health_status = self.check_health()
if not health_status:
logger.error("OPEADocSum health check failed.")
async def generate(self, input: DocSumLLMParams, client):
"""Invokes the TGI/vLLM LLM service to generate summarization for the provided input.
Args:
input (DocSumLLMParams): The input text(s).
client: TGI/vLLM based client
"""
### check summary type
summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
if input.summary_type not in summary_types:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.summary_type == "auto": ### Check input token length in auto mode
token_len = len(self.tokenizer.encode(input.query))
if token_len > MAX_INPUT_TOKENS + 50:
input.summary_type = "refine"
if logflag:
logger.info(
f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
)
else:
input.summary_type = "stuff"
if logflag:
logger.info(
f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
)
### Check input language
if input.language in ["en", "auto"]:
templ = templ_en
templ_refine = templ_refine_en
elif input.language in ["zh"]:
templ = templ_zh
templ_refine = templ_refine_zh
else:
raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')
## Prompt
PROMPT = PromptTemplate.from_template(templ)
if input.summary_type == "refine":
PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
if logflag:
logger.info("After prompting:")
logger.info(PROMPT)
if input.summary_type == "refine":
logger.info(PROMPT_REFINE)
## Split text
if input.summary_type == "stuff":
text_splitter = CharacterTextSplitter()
else:
if input.summary_type == "refine":
if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: ## 128 is reserved prompt length
raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
max_input_tokens = min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)
else:
if MAX_TOTAL_TOKENS <= input.max_tokens + 50: # 50 is reserved token length for prompt
raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
max_input_tokens = min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)
chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
tokenizer=self.tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if logflag:
logger.info(f"set chunk size to: {chunk_size}")
logger.info(f"set chunk overlap to: {chunk_overlap}")
texts = text_splitter.split_text(input.query)
docs = [Document(page_content=t) for t in texts]
if logflag:
logger.info(f"Split input query into {len(docs)} chunks")
logger.info(f"The character length of the first chunk is {len(texts[0])}")
## LLM chain
summary_type = input.summary_type
if summary_type == "stuff":
llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
elif summary_type == "truncate":
docs = [docs[0]]
llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
elif summary_type == "map_reduce":
llm_chain = load_summarize_chain(
llm=client,
map_prompt=PROMPT,
combine_prompt=PROMPT,
chain_type="map_reduce",
return_intermediate_steps=True,
)
elif summary_type == "refine":
llm_chain = load_summarize_chain(
llm=client,
question_prompt=PROMPT,
refine_prompt=PROMPT_REFINE,
chain_type="refine",
return_intermediate_steps=True,
)
else:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
if input.summary_type in ["map_reduce", "refine"]:
intermediate_steps = response["intermediate_steps"]
if logflag:
logger.info("intermediate_steps:")
logger.info(intermediate_steps)
output_text = response["output_text"]
if logflag:
logger.info("\n\noutput_text:")
logger.info(output_text)
return GeneratedDoc(text=output_text, prompt=input.query)

View File

@@ -0,0 +1,58 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
templ_en = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
templ_zh = """请简要概括以下内容:
"{text}"
概况:"""
templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.
Existing Summary:
"{existing_answer}"
New Context:
"{text}"
Final Summary:
"""
templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
初始摘要:
"{existing_answer}"
新的文本:
"{text}"
最终摘要:
"""

View File

@@ -0,0 +1,76 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import HuggingFaceEndpoint
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("llm_docsum_tgi")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEADocSum_TGI")
class OPEADocSum_TGI(OPEADocSum):
"""A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API.
Attributes:
client (TGI): An instance of the TGI client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the TGI LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
# response = requests.get(f"{self.llm_endpoint}/health")
            # Will remove this workaround once the TGI Gaudi health endpoint bug is fixed
url = f"{self.llm_endpoint}/generate"
data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}
headers = {"Content-Type": "application/json"}
response = requests.post(url=url, json=data, headers=headers)
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: DocSumLLMParams):
"""Invokes the TGI LLM service to generate summarization output for the provided input.
Args:
input (DocSumLLMParams): The input text(s).
"""
server_kwargs = {}
if self.access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"}
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
self.client = HuggingFaceEndpoint(
endpoint_url=self.llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
result = await self.generate(input, self.client)
return result

View File

@@ -0,0 +1,69 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import VLLMOpenAI
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("llm_docsum_vllm")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEADocSum_vLLM")
class OPEADocSum_vLLM(OPEADocSum):
"""A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API.
Attributes:
client (vLLM): An instance of the vLLM client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the vLLM LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.llm_endpoint}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: DocSumLLMParams):
"""Invokes the vLLM LLM service to generate summarization output for the provided input.
Args:
input (DocSumLLMParams): The input text(s).
"""
headers = {}
if self.access_token:
headers = {"Authorization": f"Bearer {self.access_token}"}
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
self.client = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=self.llm_endpoint + "/v1",
model_name=MODEL_NAME,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
result = await self.generate(input, self.client)
return result

View File

@@ -0,0 +1,58 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from integrations.tgi import OPEADocSum_TGI
from integrations.vllm import OPEADocSum_vLLM
from comps import (
CustomLogger,
DocSumLLMParams,
OpeaComponentLoader,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
llm_component_name = os.getenv("DocSum_COMPONENT_NAME", "OPEADocSum_TGI")
# Initialize OpeaComponentLoader
loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM DocSum Component: {llm_component_name}")
@register_microservice(
name="opea_service@llm_docsum",
service_type=ServiceType.LLM,
endpoint="/v1/docsum",
host="0.0.0.0",
port=9000,
)
@register_statistics(names=["opea_service@llm_docsum"])
async def llm_generate(input: DocSumLLMParams):
start = time.time()
# Log the input if logging is enabled
if logflag:
logger.info(input)
try:
# Use the controller to invoke the active component
response = await loader.invoke(input)
# Record statistics
statistics_dict["opea_service@llm_docsum"].append_latency(time.time() - start, None)
return response
except Exception as e:
logger.error(f"Error during DocSum invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA DocSum Microservice is starting...")
opea_microservices["opea_service@llm_docsum"].start()

View File

@@ -1,37 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi_service:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
container_name: tgi-service
ports:
- "8008:80"
volumes:
- "./data:/data"
environment:
HF_TOKEN: ${HF_TOKEN}
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm:
image: opea/llm-docsum-tgi:latest
container_name: llm-docsum-tgi-server
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${LLM_MODEL_ID}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py

View File

@@ -1,245 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
templ_en = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
templ_zh = """请简要概括以下内容:
"{text}"
概况:"""
templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.
Existing Summary:
"{existing_answer}"
New Context:
"{text}"
Final Summary:
"""
templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
初始摘要:
"{existing_answer}"
新的文本:
"{text}"
最终摘要:
"""
@register_microservice(
name="opea_service@llm_docsum",
service_type=ServiceType.LLM,
endpoint="/v1/chat/docsum",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: DocSumLLMParams):
if logflag:
logger.info(input)
### check summary type
summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
if input.summary_type not in summary_types:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.summary_type == "auto": ### Check input token length in auto mode
token_len = len(tokenizer.encode(input.query))
if token_len > MAX_INPUT_TOKENS + 50:
input.summary_type = "refine"
if logflag:
logger.info(
f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
)
else:
input.summary_type = "stuff"
if logflag:
logger.info(
f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
)
if input.language in ["en", "auto"]:
templ = templ_en
templ_refine = templ_refine_en
elif input.language in ["zh"]:
templ = templ_zh
templ_refine = templ_refine_zh
else:
raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')
## Prompt
PROMPT = PromptTemplate.from_template(templ)
if input.summary_type == "refine":
PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
if logflag:
logger.info("After prompting:")
logger.info(PROMPT)
if input.summary_type == "refine":
logger.info(PROMPT_REFINE)
## Split text
if input.summary_type == "stuff":
text_splitter = CharacterTextSplitter()
else:
if input.summary_type == "refine":
if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
) # 128 is reserved token length for prompt
else:
if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
) # 50 is reserved token length for prompt
chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if logflag:
logger.info(f"set chunk size to: {chunk_size}")
logger.info(f"set chunk overlap to: {chunk_overlap}")
texts = text_splitter.split_text(input.query)
docs = [Document(page_content=t) for t in texts]
if logflag:
logger.info(f"Split input query into {len(docs)} chunks")
logger.info(f"The character length of the first chunk is {len(texts[0])}")
## Access auth
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
server_kwargs = {}
if access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}
## LLM
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
## LLM chain
summary_type = input.summary_type
if summary_type == "stuff":
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "truncate":
docs = [docs[0]]
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "map_reduce":
llm_chain = load_summarize_chain(
llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
)
elif summary_type == "refine":
llm_chain = load_summarize_chain(
llm=llm,
question_prompt=PROMPT,
refine_prompt=PROMPT_REFINE,
chain_type="refine",
return_intermediate_steps=True,
)
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
if input.summary_type in ["map_reduce", "refine"]:
intermediate_steps = response["intermediate_steps"]
if logflag:
logger.info("intermediate_steps:")
logger.info(intermediate_steps)
output_text = response["output_text"]
if logflag:
logger.info("\n\noutput_text:")
logger.info(output_text)
return GeneratedDoc(text=output_text, prompt=input.query)
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
opea_microservices["opea_service@llm_docsum"].start()

View File

@@ -1,28 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
ARG ARCH="cpu"
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/llms/summarization/vllm/langchain/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/summarization/vllm/langchain
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,171 +0,0 @@
# Document Summary vLLM Microservice
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using vLLM.
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving. It delivers state-of-the-art serving throughput with advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM already supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products).
## 🚀1. Start Microservice with Python 🐍 (Option 1)
To start the LLM microservice, you need to install python packages first.
### 1.1 Install Requirements
```bash
pip install -r requirements.txt
```
### 1.2 Start LLM Service
```bash
export HF_TOKEN=${your_hf_api_token}
export LLM_MODEL_ID=${your_hf_llm_model}
docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID}
```
### 1.3 Verify the vLLM Service
```bash
curl http://${your_ip}:8008/v1/chat/completions \
-X POST \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning? "}]}'
```
### 1.4 Start LLM Service with Python Script
```bash
export vLLM_ENDPOINT="http://${your_ip}:8008"
python llm.py
```
## 🚀2. Start Microservice with Docker 🐳 (Option 2)
If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a vLLM service with docker.
To setup or build the vLLM image follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi)
### 2.1 Setup Environment Variables
In order to start vLLM and LLM services, you need to setup the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export vLLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```
### 2.2 Build Docker Image
```bash
cd ../../../../../
docker build -t opea/llm-docsum-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/vllm/langchain/Dockerfile .
```
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
### 2.3 Run Docker with CLI (Option A)
```bash
docker run -d --name="llm-docsum-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-docsum-vllm:latest
```
### 2.4 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
```
## 🚀3. Consume LLM Service
### 3.1 Check Service Status
```bash
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```
### 3.2 Consume LLM Service
In the DocSum microservice, in addition to basic LLM parameters, we also support several optimization parameters.
- "language": specify the language, can be "auto", "en", "zh", default is "auto"
If you want to deal with long context, you can select a suitable summary type; details are in section 3.2.2.
- "summary_type": can be "auto", "stuff", "truncate", "map_reduce", "refine", default is "auto"
- "chunk_size": max token length for each chunk. Set to be different default value according to "summary_type".
- "chunk_overlap": overlap token length between each chunk, default is 0.1\*chunk_size
#### 3.2.1 Basic usage
```bash
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'
# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'
# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
#### 3.2.2 Long context summarization with "summary_type"
**summary_type=auto**
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
**summary_type=stuff**
In this mode the LLM generates a summary based on the complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed the LLM context limit and raise an error when given long input.
**summary_type=truncate**
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
-H 'Content-Type: application/json'
```
**summary_type=map_reduce**
Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```
**summary_type=refine**
Refine mode will split the input into multiple chunks, generate a summary for the first chunk, combine it with the second, and loop over every remaining chunk to get the final summary.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
-H 'Content-Type: application/json'
```

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,247 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from pathlib import Path as p
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_community.llms import VLLMOpenAI
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS"))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS"))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", None)
templ_en = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
templ_zh = """请简要概括以下内容:
"{text}"
概况:"""
templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.
Existing Summary:
"{existing_answer}"
New Context:
"{text}"
Final Summary:
"""
templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
初始摘要:
"{existing_answer}"
新的文本:
"{text}"
最终摘要:
"""
@register_microservice(
name="opea_service@llm_docsum",
service_type=ServiceType.LLM,
endpoint="/v1/chat/docsum",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: DocSumLLMParams):
if logflag:
logger.info(input)
### check summary type
summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
if input.summary_type not in summary_types:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.summary_type == "auto": ### Check input token length in auto mode
token_len = len(tokenizer.encode(input.query))
if token_len > MAX_INPUT_TOKENS + 50:
input.summary_type = "refine"
if logflag:
logger.info(
f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
)
else:
input.summary_type = "stuff"
if logflag:
logger.info(
f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
)
if input.language in ["en", "auto"]:
templ = templ_en
templ_refine = templ_refine_en
elif input.language in ["zh"]:
templ = templ_zh
templ_refine = templ_refine_zh
else:
raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')
## Prompt
PROMPT = PromptTemplate.from_template(templ)
if input.summary_type == "refine":
PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
if logflag:
logger.info("After prompting:")
logger.info(PROMPT)
if input.summary_type == "refine":
logger.info(PROMPT_REFINE)
## Split text
if input.summary_type == "stuff":
text_splitter = CharacterTextSplitter()
else:
if input.summary_type == "refine":
if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
) # 128 is reserved token length for prompt
else:
if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
) # 50 is reserved token length for prompt
chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if logflag:
logger.info(f"set chunk size to: {chunk_size}")
logger.info(f"set chunk overlap to: {chunk_overlap}")
texts = text_splitter.split_text(input.query)
docs = [Document(page_content=t) for t in texts]
if logflag:
logger.info(f"Split input query into {len(docs)} chunks")
logger.info(f"The character length of the first chunk is {len(texts[0])}")
## Access auth
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
headers = {}
if access_token:
headers = {"Authorization": f"Bearer {access_token}"}
## LLM
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=llm_endpoint + "/v1",
model_name=model,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
## LLM chain
summary_type = input.summary_type
if summary_type == "stuff":
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "truncate":
docs = [docs[0]]
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "map_reduce":
llm_chain = load_summarize_chain(
llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
)
elif summary_type == "refine":
llm_chain = load_summarize_chain(
llm=llm,
question_prompt=PROMPT,
refine_prompt=PROMPT_REFINE,
chain_type="refine",
return_intermediate_steps=True,
)
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
if input.summary_type in ["map_reduce", "refine"]:
intermediate_steps = response["intermediate_steps"]
if logflag:
logger.info("intermediate_steps:")
logger.info(intermediate_steps)
output_text = response["output_text"]
if logflag:
logger.info("\n\noutput_text:")
logger.info(output_text)
return GeneratedDoc(text=output_text, prompt=input.query)
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
opea_microservices["opea_service@llm_docsum"].start()

View File

@@ -1,16 +0,0 @@
docarray[full]
fastapi
httpx==0.27.2
huggingface_hub
langchain #==0.1.12
langchain-huggingface
langchain-openai
langchain_community
langchainhub
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn