Refactor llm Docsum (#1101)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
This commit is contained in:
XinyaoWa
2025-01-13 15:24:43 +08:00
committed by GitHub
parent 3a7ccb0a75
commit 88f93733b0
29 changed files with 1196 additions and 962 deletions

View File

@@ -0,0 +1,52 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm:
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,63 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi_gaudi_server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm:
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,55 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
vllm-service:
image: opea/vllm:latest
container_name: vllm-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm:
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
vllm-service:
condition: service_healthy
ports:
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -8,37 +8,52 @@ services:
image: opea/vllm-gaudi:latest
container_name: vllm-gaudi-server
ports:
- "8008:80"
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_INPUT_TOKENS}
llm:
image: opea/llm-docsum-vllm:latest
container_name: llm-docsum-vllm-server
image: opea/llm-docsum:latest
container_name: llm-docsum-server
depends_on:
vllm-service:
condition: service_healthy
ports:
- "9000:9000"
- ${DOCSUM_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
vLLM_ENDPOINT: ${vLLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:

View File

@@ -19,10 +19,10 @@ COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/llms/summarization/tgi/langchain/requirements.txt
pip install --no-cache-dir -r /home/user/comps/llms/src/doc-summarization/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/summarization/tgi/langchain
WORKDIR /home/user/comps/llms/src/doc-summarization
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,66 +1,44 @@
# Document Summary TGI Microservice
# Document Summary LLM Microservice
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference on Intel Xeon and Gaudi2 processors. You can set the backend service to either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm).
## 🚀1. Start Microservice with Python 🐍 (Option 1)
## 🚀1. Start Microservice with Docker 🐳
To start the LLM microservice, you need to install python packages first.
### 1.1 Setup Environment Variables
### 1.1 Install Requirements
In order to start DocSum services, you need to set up the following environment variables first.
```bash
pip install -r requirements.txt
```
### 1.2 Start LLM Service
```bash
export HF_TOKEN=${your_hf_api_token}
docker run -p 8008:80 -v ./data:/data --name llm-docsum-tgi --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id ${your_hf_llm_model}
```
### 1.3 Verify the TGI Service
```bash
curl http://${your_ip}:8008/v1/chat/completions \
-X POST \
-d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
-H 'Content-Type: application/json'
```
### 1.4 Start LLM Service with Python Script
```bash
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
python llm.py
```
## 🚀2. Start Microservice with Docker 🐳 (Option 2)
If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service with docker.
### 2.1 Setup Environment Variables
In order to start TGI and LLM services, you need to setup the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export DOCSUM_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export LLM_MODEL_ID=${your_hf_llm_model}
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "OPEADocSum_vLLM"
```
Please make sure MAX_TOTAL_TOKENS is larger than (MAX_INPUT_TOKENS + max_new_tokens + 50); 50 tokens are reserved for the prompt.
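For reference, this budget can be checked with a quick sketch (the values below are the example defaults from section 1.1; `max_new_tokens` stands for the `max_tokens` you plan to request and is an assumed example value):

```python
# Sanity check for the token budget described above (example values; adjust to your setup).
MAX_INPUT_TOKENS = 2048
MAX_TOTAL_TOKENS = 4096
max_new_tokens = 1024        # the "max_tokens" you plan to send per request (assumed example)
RESERVED_PROMPT_TOKENS = 50  # reserved prompt length

assert MAX_TOTAL_TOKENS > MAX_INPUT_TOKENS + max_new_tokens + RESERVED_PROMPT_TOKENS, \
    "MAX_TOTAL_TOKENS must be larger than MAX_INPUT_TOKENS + max_new_tokens + 50"
```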
### 2.2 Build Docker Image
### 1.2 Build Docker Image
Step 1: Prepare the backend LLM docker image.
If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM docker image first. This step is not needed for TGI.
Step 2: Build the DocSum docker image.
```bash
cd ../../../../../
docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
cd ../../../../
docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
```
### 1.3 Run Docker
To start a docker container, you have two options:
- A. Run Docker with CLI
@@ -68,16 +46,45 @@ To start a docker container, you have two options:
You can choose one as needed.
### 2.3 Run Docker with CLI (Option A)
### 1.3.1 Run Docker with CLI (Option A)
Step 1: Start the backend LLM service
Please refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guideline to start a backend LLM service.
Step 2: Start the DocSum microservice
```bash
docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} opea/llm-docsum-tgi:latest
docker run -d \
--name="llm-docsum-server" \
-p 9000:9000 \
--ipc=host \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e LLM_MODEL_ID=$LLM_MODEL_ID \
-e LLM_ENDPOINT=$LLM_ENDPOINT \
-e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
-e DocSum_COMPONENT_NAME=$DocSum_COMPONENT_NAME \
-e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} \
-e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} \
opea/llm-docsum:latest
```
### 2.4 Run Docker with Docker Compose (Option B)
### 1.3.2 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
cd ../../deployment/docker_compose/
# Backend is TGI on Xeon
docker compose -f doc-summarization_tgi.yaml up -d
# Backend is TGI on Gaudi
# docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d
# Backend is vLLM on Xeon
# docker compose -f doc-summarization_vllm.yaml up -d
# Backend is vLLM on Gaudi
# docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d
```
## 🚀3. Consume LLM Service
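Besides the curl examples below, the `/v1/docsum` endpoint can also be consumed directly from Python. A minimal sketch for the streaming case (assuming the microservice is reachable at `localhost:9000`; the payload fields match the curl examples):

```python
import json

import requests  # assumed to be available in your client environment

url = "http://localhost:9000/v1/docsum"  # adjust host/port to your deployment
payload = {
    "query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models.",
    "max_tokens": 32,
    "language": "en",
    "stream": True,
}

with requests.post(url, json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # The service streams server-sent events of the form "data: <json>" and ends with "data: [DONE]".
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        print(json.loads(data))  # each event carries serialized LangChain log ops
```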
@@ -106,19 +113,19 @@ If you want to deal with long context, can select suitable summary type, details
```bash
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'
# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'
# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
@@ -139,7 +146,7 @@ In this mode LLM generate summary based on complete input text. In this case ple
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
-H 'Content-Type: application/json'
@@ -152,7 +159,7 @@ Map_reduce mode will split the inputs into multiple chunks, map each document to
In this mode, the default `chunk_size` is set to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
@@ -165,7 +172,7 @@ Refine mode will split the input into multiple chunks, generate a summary for the first chunk, combine it with the second, and loop over every remaining chunk to get the final summary.
In this mode, the default `chunk_size` is set to `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
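For reference, the default `chunk_size` for the truncate, map_reduce, and refine modes can be reproduced from the environment variables in section 1.1, as in the sketch below (example values; `max_tokens` is the per-request field, e.g. 32 in the curl examples). The refine request itself is shown in the curl example that follows.

```python
# Default chunk_size per summary_type (example values from section 1.1; adjust to your setup).
MAX_INPUT_TOKENS = 2048
MAX_TOTAL_TOKENS = 4096
max_tokens = 32  # "max_tokens" field of the request

# truncate / map_reduce: 50 tokens are reserved for the prompt
chunk_size_truncate = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)      # -> 2048
chunk_size_map_reduce = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)    # -> 2048

# refine: two prompts per step, so 2 * max_tokens plus 128 reserved tokens
chunk_size_refine = min(MAX_TOTAL_TOKENS - 2 * max_tokens - 128, MAX_INPUT_TOKENS)   # -> 2048

# An explicit "chunk_size" in the request is capped at these defaults.
print(chunk_size_truncate, chunk_size_map_reduce, chunk_size_refine)
```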
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
-H 'Content-Type: application/json'

View File

@@ -5,4 +5,4 @@
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py
python opea_docsum_microservice.py

View File

@@ -0,0 +1,204 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType
from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs
from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
MODEL_NAME = os.getenv("LLM_MODEL_ID")
MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
if os.getenv("LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
elif os.getenv("TGI_LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT")
elif os.getenv("vLLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT")
else:
DEFAULT_ENDPOINT = "http://localhost:8080"
def get_llm_endpoint():
if not MODEL_CONFIGS:
return DEFAULT_ENDPOINT
else:
# Validate and Load the models config if MODEL_CONFIGS is not null
configs_map = {}
try:
configs_map = load_model_configs(MODEL_CONFIGS)
except ConfigError as e:
logger.error(f"Failed to load model configurations: {e}")
raise ConfigError(f"Failed to load model configurations: {e}")
try:
return configs_map.get(MODEL_NAME).get("endpoint")
except ConfigError as e:
logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}")
raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs")
class OPEADocSum(OpeaComponent):
"""A specialized OPEA DocSum component derived from OpeaComponent.
Attributes:
client (TGI/vLLM): An instance of the TGI/vLLM client for text generation.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LLM.name.lower(), description, config)
self.access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
self.llm_endpoint = get_llm_endpoint()
self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
health_status = self.check_health()
if not health_status:
logger.error("OPEADocSum health check failed.")
async def generate(self, input: DocSumLLMParams, client):
"""Invokes the TGI/vLLM LLM service to generate summarization for the provided input.
Args:
input (DocSumLLMParams): The input text(s).
client: TGI/vLLM based client
"""
### check summary type
summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
if input.summary_type not in summary_types:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.summary_type == "auto": ### Check input token length in auto mode
token_len = len(self.tokenizer.encode(input.query))
if token_len > MAX_INPUT_TOKENS + 50:
input.summary_type = "refine"
if logflag:
logger.info(
f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
)
else:
input.summary_type = "stuff"
if logflag:
logger.info(
f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
)
### Check input language
if input.language in ["en", "auto"]:
templ = templ_en
templ_refine = templ_refine_en
elif input.language in ["zh"]:
templ = templ_zh
templ_refine = templ_refine_zh
else:
raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')
## Prompt
PROMPT = PromptTemplate.from_template(templ)
if input.summary_type == "refine":
PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
if logflag:
logger.info("After prompting:")
logger.info(PROMPT)
if input.summary_type == "refine":
logger.info(PROMPT_REFINE)
## Split text
if input.summary_type == "stuff":
text_splitter = CharacterTextSplitter()
else:
if input.summary_type == "refine":
if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128: ## 128 is reserved prompt length
raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
max_input_tokens = min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)
else:
if MAX_TOTAL_TOKENS <= input.max_tokens + 50: # 50 is reserved token length for prompt
raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
max_input_tokens = min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)
chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
tokenizer=self.tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if logflag:
logger.info(f"set chunk size to: {chunk_size}")
logger.info(f"set chunk overlap to: {chunk_overlap}")
texts = text_splitter.split_text(input.query)
docs = [Document(page_content=t) for t in texts]
if logflag:
logger.info(f"Split input query into {len(docs)} chunks")
logger.info(f"The character length of the first chunk is {len(texts[0])}")
## LLM chain
summary_type = input.summary_type
if summary_type == "stuff":
llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
elif summary_type == "truncate":
docs = [docs[0]]
llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
elif summary_type == "map_reduce":
llm_chain = load_summarize_chain(
llm=client,
map_prompt=PROMPT,
combine_prompt=PROMPT,
chain_type="map_reduce",
return_intermediate_steps=True,
)
elif summary_type == "refine":
llm_chain = load_summarize_chain(
llm=client,
question_prompt=PROMPT,
refine_prompt=PROMPT_REFINE,
chain_type="refine",
return_intermediate_steps=True,
)
else:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
if input.summary_type in ["map_reduce", "refine"]:
intermediate_steps = response["intermediate_steps"]
if logflag:
logger.info("intermediate_steps:")
logger.info(intermediate_steps)
output_text = response["output_text"]
if logflag:
logger.info("\n\noutput_text:")
logger.info(output_text)
return GeneratedDoc(text=output_text, prompt=input.query)

View File

@@ -0,0 +1,58 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
templ_en = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
templ_zh = """请简要概括以下内容:
"{text}"
概况:"""
templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.
Existing Summary:
"{existing_answer}"
New Context:
"{text}"
Final Summary:
"""
templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
初始摘要:
"{existing_answer}"
新的文本:
"{text}"
最终摘要:
"""

View File

@@ -0,0 +1,76 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import HuggingFaceEndpoint
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("llm_docsum_tgi")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEADocSum_TGI")
class OPEADocSum_TGI(OPEADocSum):
"""A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on Lanchain HuggingFaceEndpoint API.
Attributes:
client (TGI): An instance of the TGI client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the TGI LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
# response = requests.get(f"{self.llm_endpoint}/health")
            # Will remove this workaround once the TGI Gaudi health endpoint bug is fixed
url = f"{self.llm_endpoint}/generate"
data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}
headers = {"Content-Type": "application/json"}
response = requests.post(url=url, json=data, headers=headers)
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: DocSumLLMParams):
"""Invokes the TGI LLM service to generate summarization output for the provided input.
Args:
input (DocSumLLMParams): The input text(s).
"""
server_kwargs = {}
if self.access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"}
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
self.client = HuggingFaceEndpoint(
endpoint_url=self.llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
result = await self.generate(input, self.client)
return result

View File

@@ -0,0 +1,69 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import VLLMOpenAI
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("llm_docsum_vllm")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEADocSum_vLLM")
class OPEADocSum_vLLM(OPEADocSum):
"""A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on Lanchain VLLMOpenAI API.
Attributes:
client (vLLM): An instance of the vLLM client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the vLLM LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.llm_endpoint}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: DocSumLLMParams):
"""Invokes the vLLM LLM service to generate summarization output for the provided input.
Args:
input (DocSumLLMParams): The input text(s).
"""
headers = {}
if self.access_token:
headers = {"Authorization": f"Bearer {self.access_token}"}
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
self.client = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=self.llm_endpoint + "/v1",
model_name=MODEL_NAME,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
result = await self.generate(input, self.client)
return result

View File

@@ -0,0 +1,58 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from integrations.tgi import OPEADocSum_TGI
from integrations.vllm import OPEADocSum_vLLM
from comps import (
CustomLogger,
DocSumLLMParams,
OpeaComponentLoader,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
llm_component_name = os.getenv("DocSum_COMPONENT_NAME", "OPEADocSum_TGI")
# Initialize OpeaComponentLoader
loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM DocSum Component: {llm_component_name}")
@register_microservice(
name="opea_service@llm_docsum",
service_type=ServiceType.LLM,
endpoint="/v1/docsum",
host="0.0.0.0",
port=9000,
)
@register_statistics(names=["opea_service@llm_docsum"])
async def llm_generate(input: DocSumLLMParams):
start = time.time()
# Log the input if logging is enabled
if logflag:
logger.info(input)
try:
# Use the controller to invoke the active component
response = await loader.invoke(input)
# Record statistics
statistics_dict["opea_service@llm_docsum"].append_latency(time.time() - start, None)
return response
except Exception as e:
logger.error(f"Error during DocSum invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA DocSum Microservice is starting...")
opea_microservices["opea_service@llm_docsum"].start()

View File

@@ -1,37 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi_service:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
container_name: tgi-service
ports:
- "8008:80"
volumes:
- "./data:/data"
environment:
HF_TOKEN: ${HF_TOKEN}
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm:
image: opea/llm-docsum-tgi:latest
container_name: llm-docsum-tgi-server
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${LLM_MODEL_ID}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py

View File

@@ -1,245 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")
templ_en = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
templ_zh = """请简要概括以下内容:
"{text}"
概况:"""
templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.
Existing Summary:
"{existing_answer}"
New Context:
"{text}"
Final Summary:
"""
templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
初始摘要:
"{existing_answer}"
新的文本:
"{text}"
最终摘要:
"""
@register_microservice(
name="opea_service@llm_docsum",
service_type=ServiceType.LLM,
endpoint="/v1/chat/docsum",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: DocSumLLMParams):
if logflag:
logger.info(input)
### check summary type
summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
if input.summary_type not in summary_types:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.summary_type == "auto": ### Check input token length in auto mode
token_len = len(tokenizer.encode(input.query))
if token_len > MAX_INPUT_TOKENS + 50:
input.summary_type = "refine"
if logflag:
logger.info(
f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
)
else:
input.summary_type = "stuff"
if logflag:
logger.info(
f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
)
if input.language in ["en", "auto"]:
templ = templ_en
templ_refine = templ_refine_en
elif input.language in ["zh"]:
templ = templ_zh
templ_refine = templ_refine_zh
else:
raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')
## Prompt
PROMPT = PromptTemplate.from_template(templ)
if input.summary_type == "refine":
PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
if logflag:
logger.info("After prompting:")
logger.info(PROMPT)
if input.summary_type == "refine":
logger.info(PROMPT_REFINE)
## Split text
if input.summary_type == "stuff":
text_splitter = CharacterTextSplitter()
else:
if input.summary_type == "refine":
if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
) # 128 is reserved token length for prompt
else:
if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
) # 50 is reserved token length for prompt
chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if logflag:
logger.info(f"set chunk size to: {chunk_size}")
logger.info(f"set chunk overlap to: {chunk_overlap}")
texts = text_splitter.split_text(input.query)
docs = [Document(page_content=t) for t in texts]
if logflag:
logger.info(f"Split input query into {len(docs)} chunks")
logger.info(f"The character length of the first chunk is {len(texts[0])}")
## Access auth
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
server_kwargs = {}
if access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}
## LLM
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
## LLM chain
summary_type = input.summary_type
if summary_type == "stuff":
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "truncate":
docs = [docs[0]]
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "map_reduce":
llm_chain = load_summarize_chain(
llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
)
elif summary_type == "refine":
llm_chain = load_summarize_chain(
llm=llm,
question_prompt=PROMPT,
refine_prompt=PROMPT_REFINE,
chain_type="refine",
return_intermediate_steps=True,
)
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
if input.summary_type in ["map_reduce", "refine"]:
intermediate_steps = response["intermediate_steps"]
if logflag:
logger.info("intermediate_steps:")
logger.info(intermediate_steps)
output_text = response["output_text"]
if logflag:
logger.info("\n\noutput_text:")
logger.info(output_text)
return GeneratedDoc(text=output_text, prompt=input.query)
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
opea_microservices["opea_service@llm_docsum"].start()

View File

@@ -1,28 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
ARG ARCH="cpu"
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/llms/summarization/vllm/langchain/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/summarization/vllm/langchain
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,171 +0,0 @@
# Document Summary vLLM Microservice
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using vLLM.
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving. It delivers state-of-the-art serving throughput with advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM already supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products).
## 🚀1. Start Microservice with Python 🐍 (Option 1)
To start the LLM microservice, you need to install python packages first.
### 1.1 Install Requirements
```bash
pip install -r requirements.txt
```
### 1.2 Start LLM Service
```bash
export HF_TOKEN=${your_hf_api_token}
export LLM_MODEL_ID=${your_hf_llm_model}
docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID}
```
### 1.3 Verify the vLLM Service
```bash
curl http://${your_ip}:8008/v1/chat/completions \
-X POST \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning? "}]}'
```
### 1.4 Start LLM Service with Python Script
```bash
export vLLM_ENDPOINT="http://${your_ip}:8008"
python llm.py
```
## 🚀2. Start Microservice with Docker 🐳 (Option 2)
If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a vLLM service with docker.
To setup or build the vLLM image follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi)
### 2.1 Setup Environment Variables
In order to start vLLM and LLM services, you need to setup the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export vLLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```
### 2.2 Build Docker Image
```bash
cd ../../../../../
docker build -t opea/llm-docsum-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/vllm/langchain/Dockerfile .
```
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
### 2.3 Run Docker with CLI (Option A)
```bash
docker run -d --name="llm-docsum-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-docsum-vllm:latest
```
### 2.4 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
```
## 🚀3. Consume LLM Service
### 3.1 Check Service Status
```bash
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```
### 3.2 Consume LLM Service
In the DocSum microservice, in addition to basic LLM parameters, we also support several optimization parameters.
- "language": specify the language, can be "auto", "en", "zh", default is "auto"
If you want to deal with long context, you can select a suitable summary type; details are in section 3.2.2.
- "summary_type": can be "auto", "stuff", "truncate", "map_reduce", "refine", default is "auto"
- "chunk_size": max token length for each chunk. Set to be different default value according to "summary_type".
- "chunk_overlap": overlap token length between each chunk, default is 0.1\*chunk_size
#### 3.2.1 Basic usage
```bash
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'
# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'
# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
#### 3.2.2 Long context summarization with "summary_type"
**summary_type=auto**
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
**summary_type=stuff**
In this mode the LLM generates a summary based on the complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed the LLM context limit and raise an error when given long input.
**summary_type=truncate**
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
-H 'Content-Type: application/json'
```
**summary_type=map_reduce**
Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```
**summary_type=refine**
Refine mode will split the input into multiple chunks, generate a summary for the first chunk, combine it with the second, and loop over every remaining chunk to get the final summary.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
-H 'Content-Type: application/json'
```

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,247 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from pathlib import Path as p
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_community.llms import VLLMOpenAI
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS"))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS"))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", None)
templ_en = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
templ_zh = """请简要概括以下内容:
"{text}"
概况:"""
templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.
Existing Summary:
"{existing_answer}"
New Context:
"{text}"
Final Summary:
"""
templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
初始摘要:
"{existing_answer}"
新的文本:
"{text}"
最终摘要:
"""
@register_microservice(
name="opea_service@llm_docsum",
service_type=ServiceType.LLM,
endpoint="/v1/chat/docsum",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: DocSumLLMParams):
if logflag:
logger.info(input)
### check summary type
summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
if input.summary_type not in summary_types:
raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
if input.summary_type == "auto": ### Check input token length in auto mode
token_len = len(tokenizer.encode(input.query))
if token_len > MAX_INPUT_TOKENS + 50:
input.summary_type = "refine"
if logflag:
logger.info(
f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
)
else:
input.summary_type = "stuff"
if logflag:
logger.info(
f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
)
if input.language in ["en", "auto"]:
templ = templ_en
templ_refine = templ_refine_en
elif input.language in ["zh"]:
templ = templ_zh
templ_refine = templ_refine_zh
else:
raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')
## Prompt
PROMPT = PromptTemplate.from_template(templ)
if input.summary_type == "refine":
PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
if logflag:
logger.info("After prompting:")
logger.info(PROMPT)
if input.summary_type == "refine":
logger.info(PROMPT_REFINE)
## Split text
if input.summary_type == "stuff":
text_splitter = CharacterTextSplitter()
else:
if input.summary_type == "refine":
if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
) # 128 is reserved token length for prompt
else:
if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
max_input_tokens = min(
MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
) # 50 is reserved token length for prompt
chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
if logflag:
logger.info(f"set chunk size to: {chunk_size}")
logger.info(f"set chunk overlap to: {chunk_overlap}")
texts = text_splitter.split_text(input.query)
docs = [Document(page_content=t) for t in texts]
if logflag:
logger.info(f"Split input query into {len(docs)} chunks")
logger.info(f"The character length of the first chunk is {len(texts[0])}")
## Access auth
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
headers = {}
if access_token:
headers = {"Authorization": f"Bearer {access_token}"}
## LLM
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=llm_endpoint + "/v1",
model_name=model,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
## LLM chain
summary_type = input.summary_type
if summary_type == "stuff":
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "truncate":
docs = [docs[0]]
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
elif summary_type == "map_reduce":
llm_chain = load_summarize_chain(
llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
)
elif summary_type == "refine":
llm_chain = load_summarize_chain(
llm=llm,
question_prompt=PROMPT,
refine_prompt=PROMPT_REFINE,
chain_type="refine",
return_intermediate_steps=True,
)
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
if input.summary_type in ["map_reduce", "refine"]:
intermediate_steps = response["intermediate_steps"]
if logflag:
logger.info("intermediate_steps:")
logger.info(intermediate_steps)
output_text = response["output_text"]
if logflag:
logger.info("\n\noutput_text:")
logger.info(output_text)
return GeneratedDoc(text=output_text, prompt=input.query)
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
opea_microservices["opea_service@llm_docsum"].start()

View File

@@ -1,16 +0,0 @@
docarray[full]
fastapi
httpx==0.27.2
huggingface_hub
langchain #==0.1.12
langchain-huggingface
langchain-openai
langchain_community
langchainhub
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn