Refactor llm Docsum (#1101)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
@@ -0,0 +1,52 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tgi-service:
    image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
    container_name: tgi-server
    ports:
      - ${LLM_ENDPOINT_PORT:-8008}:80
    volumes:
      - "./data:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
  llm:
    image: opea/llm-docsum:latest
    container_name: llm-docsum-server
    depends_on:
      tgi-service:
        condition: service_healthy
    ports:
      - ${DOCSUM_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
      LLM_ENDPOINT: ${LLM_ENDPOINT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped

networks:
  default:
    driver: bridge
@@ -0,0 +1,63 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tgi-service:
    image: ghcr.io/huggingface/tgi-gaudi:2.3.1
    container_name: tgi_gaudi_server
    ports:
      - ${LLM_ENDPOINT_PORT:-8008}:80
    volumes:
      - "./data:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HF_HUB_DISABLE_PROGRESS_BARS: 1
      HF_HUB_ENABLE_HF_TRANSFER: 0
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      ENABLE_HPU_GRAPH: true
      LIMIT_HPU_GRAPH: true
      USE_FLASH_ATTENTION: true
      FLASH_ATTENTION_RECOMPUTE: true
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
  llm:
    image: opea/llm-docsum:latest
    container_name: llm-docsum-server
    depends_on:
      tgi-service:
        condition: service_healthy
    ports:
      - ${DOCSUM_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
      LLM_ENDPOINT: ${LLM_ENDPOINT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped

networks:
  default:
    driver: bridge
@@ -0,0 +1,55 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  vllm-service:
    image: opea/vllm:latest
    container_name: vllm-server
    ports:
      - ${LLM_ENDPOINT_PORT:-8008}:80
    volumes:
      - "./data:/data"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
      VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
  llm:
    image: opea/llm-docsum:latest
    container_name: llm-docsum-server
    depends_on:
      vllm-service:
        condition: service_healthy
    ports:
      - ${DOCSUM_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
      LLM_ENDPOINT: ${LLM_ENDPOINT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped

networks:
  default:
    driver: bridge
@@ -8,37 +8,52 @@ services:
    image: opea/vllm-gaudi:latest
    container_name: vllm-gaudi-server
    ports:
      - "8008:80"
      - ${LLM_ENDPOINT_PORT:-8008}:80
    volumes:
      - "./data:/data"
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HF_TOKEN}
      HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      VLLM_TORCH_PROFILER_DIR: "/mnt"
      host_ip: ${host_ip}
      LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
      VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 100
    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_INPUT_TOKENS}
  llm:
    image: opea/llm-docsum-vllm:latest
    container_name: llm-docsum-vllm-server
    image: opea/llm-docsum:latest
    container_name: llm-docsum-server
    depends_on:
      vllm-service:
        condition: service_healthy
    ports:
      - "9000:9000"
      - ${DOCSUM_PORT:-9000}:9000
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      vLLM_ENDPOINT: ${vLLM_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
      LLM_ENDPOINT: ${LLM_ENDPOINT}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
      LOGFLAG: ${LOGFLAG:-False}
    restart: unless-stopped

networks:
@@ -19,10 +19,10 @@ COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/llms/summarization/tgi/langchain/requirements.txt
    pip install --no-cache-dir -r /home/user/comps/llms/src/doc-summarization/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/summarization/tgi/langchain
WORKDIR /home/user/comps/llms/src/doc-summarization

ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -1,66 +1,44 @@
# Document Summary TGI Microservice
# Document Summary LLM Microservice

This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors.
[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference on Intel Xeon and Gaudi2 processors. You can set the backend service to either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm).

## 🚀1. Start Microservice with Python 🐍 (Option 1)
## 🚀1. Start Microservice with Docker 🐳

To start the LLM microservice, you need to install python packages first.
### 1.1 Setup Environment Variables

### 1.1 Install Requirements
In order to start DocSum services, you need to set up the following environment variables first.

```bash
pip install -r requirements.txt
```

### 1.2 Start LLM Service

```bash
export HF_TOKEN=${your_hf_api_token}
docker run -p 8008:80 -v ./data:/data --name llm-docsum-tgi --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id ${your_hf_llm_model}
```

### 1.3 Verify the TGI Service

```bash
curl http://${your_ip}:8008/v1/chat/completions \
  -X POST \
  -d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
  -H 'Content-Type: application/json'
```

### 1.4 Start LLM Service with Python Script

```bash
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
python llm.py
```

## 🚀2. Start Microservice with Docker 🐳 (Option 2)

If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service with docker.

### 2.1 Setup Environment Variables

In order to start TGI and LLM services, you need to setup the following environment variables first.

```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export DOCSUM_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export LLM_MODEL_ID=${your_hf_llm_model}
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DocSum_COMPONENT_NAME="OPEADocSum_TGI" # or "OPEADocSum_vLLM"
```

Please make sure `MAX_TOTAL_TOKENS` is larger than (`MAX_INPUT_TOKENS` + `max_new_tokens` + 50); 50 tokens are reserved for the prompt.
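
As a quick, non-normative illustration of that budget check (the 512-token `max_new_tokens` below is only an example value, not something set by this service):

```python
# Sanity-check the token budget described above (illustrative only).
import os

MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
max_new_tokens = 512  # hypothetical per-request generation budget ("max_tokens" in requests)
reserved_prompt_tokens = 50  # reserved prompt length noted above

assert MAX_TOTAL_TOKENS > MAX_INPUT_TOKENS + max_new_tokens + reserved_prompt_tokens, (
    "Increase MAX_TOTAL_TOKENS or reduce MAX_INPUT_TOKENS / max_new_tokens"
)
```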

### 2.2 Build Docker Image
### 1.2 Build Docker Image

Step 1: Prepare the backend LLM docker image.

If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM docker image first.

This step is not needed for the TGI backend.

Step 2: Build the DocSum docker image.

```bash
cd ../../../../../
docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
cd ../../../../
docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile .
```

### 1.3 Run Docker

To start a docker container, you have two options:

- A. Run Docker with CLI
@@ -68,16 +46,45 @@ To start a docker container, you have two options:

You can choose one as needed.

### 2.3 Run Docker with CLI (Option A)
### 1.3.1 Run Docker with CLI (Option A)

Step 1: Start the backend LLM service.
Please refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guidelines to start a backend LLM service.

Step 2: Start the DocSum microservice.

```bash
docker run -d --name="llm-docsum-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} opea/llm-docsum-tgi:latest
docker run -d \
  --name="llm-docsum-server" \
  -p 9000:9000 \
  --ipc=host \
  -e http_proxy=$http_proxy \
  -e https_proxy=$https_proxy \
  -e LLM_MODEL_ID=$LLM_MODEL_ID \
  -e LLM_ENDPOINT=$LLM_ENDPOINT \
  -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
  -e DocSum_COMPONENT_NAME=$DocSum_COMPONENT_NAME \
  -e MAX_INPUT_TOKENS=${MAX_INPUT_TOKENS} \
  -e MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS} \
  opea/llm-docsum:latest
```
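
Once the container is up, the service can also be exercised from Python instead of curl; a minimal sketch against the documented `/v1/docsum` endpoint (assuming the service is reachable on localhost and streaming is disabled):

```python
# Minimal request against the DocSum microservice started above (illustrative only).
import requests

payload = {
    "query": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models.",
    "max_tokens": 32,
    "language": "en",
    "stream": False,
}
resp = requests.post("http://localhost:9000/v1/docsum", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["text"])  # the generated summary (GeneratedDoc "text" field)
```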

### 2.4 Run Docker with Docker Compose (Option B)
### 1.3.2 Run Docker with Docker Compose (Option B)

```bash
docker compose -f docker_compose_llm.yaml up -d
cd ../../deployment/docker_compose/

# Backend is TGI on Xeon
docker compose -f doc-summarization_tgi.yaml up -d

# Backend is TGI on Gaudi
# docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d

# Backend is vLLM on Xeon
# docker compose -f doc-summarization_vllm.yaml up -d

# Backend is vLLM on Gaudi
# docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d
```

## 🚀3. Consume LLM Service
@@ -106,19 +113,19 @@ If you want to deal with long context, can select suitable summary type, details

```bash
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
  -H 'Content-Type: application/json'

# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
  -H 'Content-Type: application/json'

# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
  -H 'Content-Type: application/json'
@@ -139,7 +146,7 @@ In this mode LLM generate summary based on complete input text. In this case ple
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
  -H 'Content-Type: application/json'
@@ -152,7 +159,7 @@ Map_reduce mode will split the inputs into multiple chunks, map each document to
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
  -H 'Content-Type: application/json'
@@ -165,7 +172,7 @@ Refin mode will split the inputs into multiple chunks, generate summary for the
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
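
The defaults for the different modes can be reproduced with a small sketch that mirrors the formulas above (the numbers are example values only):

```python
# Default chunk_size per summary_type, following the documented formulas (illustrative only).
MAX_INPUT_TOKENS, MAX_TOTAL_TOKENS, max_tokens = 2048, 4096, 32

def default_chunk_size(summary_type: str) -> int:
    if summary_type == "refine":
        # refine reserves two generations plus 128 prompt tokens
        return min(MAX_TOTAL_TOKENS - 2 * max_tokens - 128, MAX_INPUT_TOKENS)
    # truncate / map_reduce reserve one generation plus 50 prompt tokens
    return min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)

print(default_chunk_size("refine"), default_chunk_size("map_reduce"))  # 2048 2048
```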

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
  -H 'Content-Type: application/json'
@@ -5,4 +5,4 @@

pip --no-cache-dir install -r requirements-runtime.txt

python llm.py
python opea_docsum_microservice.py
comps/llms/src/doc-summarization/integrations/common.py (new file, 204 lines)
@@ -0,0 +1,204 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

import requests
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from transformers import AutoTokenizer

from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, ServiceType
from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs

from .template import templ_en, templ_refine_en, templ_refine_zh, templ_zh

logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)

# Environment variables
MODEL_NAME = os.getenv("LLM_MODEL_ID")
MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))

if os.getenv("LLM_ENDPOINT") is not None:
    DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
elif os.getenv("TGI_LLM_ENDPOINT") is not None:
    DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT")
elif os.getenv("vLLM_ENDPOINT") is not None:
    DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT")
else:
    DEFAULT_ENDPOINT = "http://localhost:8080"


def get_llm_endpoint():
    if not MODEL_CONFIGS:
        return DEFAULT_ENDPOINT
    else:
        # Validate and Load the models config if MODEL_CONFIGS is not null
        configs_map = {}
        try:
            configs_map = load_model_configs(MODEL_CONFIGS)
        except ConfigError as e:
            logger.error(f"Failed to load model configurations: {e}")
            raise ConfigError(f"Failed to load model configurations: {e}")
        try:
            return configs_map.get(MODEL_NAME).get("endpoint")
        except ConfigError as e:
            logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}")
            raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs")


class OPEADocSum(OpeaComponent):
    """A specialized OPEA DocSum component derived from OpeaComponent.

    Attributes:
        client (TGI/vLLM): An instance of the TGI/vLLM client for text generation.
    """

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.LLM.name.lower(), description, config)
        self.access_token = (
            get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
        )
        self.llm_endpoint = get_llm_endpoint()
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        health_status = self.check_health()
        if not health_status:
            logger.error("OPEADocSum health check failed.")

    async def generate(self, input: DocSumLLMParams, client):
        """Invokes the TGI/vLLM LLM service to generate summarization for the provided input.

        Args:
            input (DocSumLLMParams): The input text(s).
            client: TGI/vLLM based client
        """
        ### check summary type
        summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
        if input.summary_type not in summary_types:
            raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
        if input.summary_type == "auto":  ### Check input token length in auto mode
            token_len = len(self.tokenizer.encode(input.query))
            if token_len > MAX_INPUT_TOKENS + 50:
                input.summary_type = "refine"
                if logflag:
                    logger.info(
                        f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
                    )
            else:
                input.summary_type = "stuff"
                if logflag:
                    logger.info(
                        f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
                    )

        ### Check input language
        if input.language in ["en", "auto"]:
            templ = templ_en
            templ_refine = templ_refine_en
        elif input.language in ["zh"]:
            templ = templ_zh
            templ_refine = templ_refine_zh
        else:
            raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')

        ## Prompt
        PROMPT = PromptTemplate.from_template(templ)
        if input.summary_type == "refine":
            PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
        if logflag:
            logger.info("After prompting:")
            logger.info(PROMPT)
            if input.summary_type == "refine":
                logger.info(PROMPT_REFINE)

        ## Split text
        if input.summary_type == "stuff":
            text_splitter = CharacterTextSplitter()
        else:
            if input.summary_type == "refine":
                if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:  ## 128 is reserved prompt length
                    raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
                max_input_tokens = min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)
            else:
                if MAX_TOTAL_TOKENS <= input.max_tokens + 50:  # 50 is reserved token length for prompt
                    raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
                max_input_tokens = min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)
            chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
            chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
            text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer=self.tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            if logflag:
                logger.info(f"set chunk size to: {chunk_size}")
                logger.info(f"set chunk overlap to: {chunk_overlap}")

        texts = text_splitter.split_text(input.query)
        docs = [Document(page_content=t) for t in texts]
        if logflag:
            logger.info(f"Split input query into {len(docs)} chunks")
            logger.info(f"The character length of the first chunk is {len(texts[0])}")

        ## LLM chain
        summary_type = input.summary_type
        if summary_type == "stuff":
            llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
        elif summary_type == "truncate":
            docs = [docs[0]]
            llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
        elif summary_type == "map_reduce":
            llm_chain = load_summarize_chain(
                llm=client,
                map_prompt=PROMPT,
                combine_prompt=PROMPT,
                chain_type="map_reduce",
                return_intermediate_steps=True,
            )
        elif summary_type == "refine":
            llm_chain = load_summarize_chain(
                llm=client,
                question_prompt=PROMPT,
                refine_prompt=PROMPT_REFINE,
                chain_type="refine",
                return_intermediate_steps=True,
            )
        else:
            raise NotImplementedError(f"Please specify the summary_type in {summary_types}")

        if input.stream:

            async def stream_generator():
                from langserve.serialization import WellKnownLCSerializer

                _serializer = WellKnownLCSerializer()
                async for chunk in llm_chain.astream_log(docs):
                    data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
                    if logflag:
                        logger.info(data)
                    yield f"data: {data}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(stream_generator(), media_type="text/event-stream")
        else:
            response = await llm_chain.ainvoke(docs)

            if input.summary_type in ["map_reduce", "refine"]:
                intermediate_steps = response["intermediate_steps"]
                if logflag:
                    logger.info("intermediate_steps:")
                    logger.info(intermediate_steps)

            output_text = response["output_text"]
            if logflag:
                logger.info("\n\noutput_text:")
                logger.info(output_text)

            return GeneratedDoc(text=output_text, prompt=input.query)
comps/llms/src/doc-summarization/integrations/template.py (new file, 58 lines)
@@ -0,0 +1,58 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

templ_en = """Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:"""

templ_zh = """请简要概括以下内容:


"{text}"


概况:"""


templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.


Existing Summary:
"{existing_answer}"



New Context:
"{text}"



Final Summary:

"""

templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。


初始摘要:
"{existing_answer}"



新的文本:
"{text}"



最终摘要:

"""
comps/llms/src/doc-summarization/integrations/tgi.py (new file, 76 lines)
@@ -0,0 +1,76 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

import requests
from langchain_community.llms import HuggingFaceEndpoint

from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType

from .common import *

logger = CustomLogger("llm_docsum_tgi")
logflag = os.getenv("LOGFLAG", False)


@OpeaComponentRegistry.register("OPEADocSum_TGI")
class OPEADocSum_TGI(OPEADocSum):
    """A specialized OPEA DocSum TGI component derived from OPEADocSum for interacting with TGI services based on the LangChain HuggingFaceEndpoint API.

    Attributes:
        client (TGI): An instance of the TGI client for text generation.
    """

    def check_health(self) -> bool:
        """Checks the health of the TGI LLM service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """

        try:
            # response = requests.get(f"{self.llm_endpoint}/health")

            # Will remove after TGI gaudi fix health bug
            url = f"{self.llm_endpoint}/generate"
            data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}
            headers = {"Content-Type": "application/json"}
            response = requests.post(url=url, json=data, headers=headers)

            if response.status_code == 200:
                return True
            else:
                return False
        except Exception as e:
            logger.error(e)
            logger.error("Health check failed")
            return False

    async def invoke(self, input: DocSumLLMParams):
        """Invokes the TGI LLM service to generate summarization output for the provided input.

        Args:
            input (DocSumLLMParams): The input text(s).
        """
        server_kwargs = {}
        if self.access_token:
            server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"}

        if input.stream and input.summary_type == "map_reduce":
            logger.info("Map Reduce mode doesn't support stream=True, setting stream=False")
            input.stream = False
        self.client = HuggingFaceEndpoint(
            endpoint_url=self.llm_endpoint,
            max_new_tokens=input.max_tokens,
            top_k=input.top_k,
            top_p=input.top_p,
            typical_p=input.typical_p,
            temperature=input.temperature,
            repetition_penalty=input.repetition_penalty,
            streaming=input.stream,
            server_kwargs=server_kwargs,
        )
        result = await self.generate(input, self.client)

        return result
comps/llms/src/doc-summarization/integrations/vllm.py (new file, 69 lines)
@@ -0,0 +1,69 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

import requests
from langchain_community.llms import VLLMOpenAI

from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType

from .common import *

logger = CustomLogger("llm_docsum_vllm")
logflag = os.getenv("LOGFLAG", False)


@OpeaComponentRegistry.register("OPEADocSum_vLLM")
class OPEADocSum_vLLM(OPEADocSum):
    """A specialized OPEA DocSum vLLM component derived from OPEADocSum for interacting with vLLM services based on the LangChain VLLMOpenAI API.

    Attributes:
        client (vLLM): An instance of the vLLM client for text generation.
    """

    def check_health(self) -> bool:
        """Checks the health of the vLLM LLM service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """

        try:
            response = requests.get(f"{self.llm_endpoint}/health")
            if response.status_code == 200:
                return True
            else:
                return False
        except Exception as e:
            logger.error(e)
            logger.error("Health check failed")
            return False

    async def invoke(self, input: DocSumLLMParams):
        """Invokes the vLLM LLM service to generate summarization output for the provided input.

        Args:
            input (DocSumLLMParams): The input text(s).
        """
        headers = {}
        if self.access_token:
            headers = {"Authorization": f"Bearer {self.access_token}"}

        if input.stream and input.summary_type == "map_reduce":
            logger.info("Map Reduce mode doesn't support stream=True, setting stream=False")
            input.stream = False
        self.client = VLLMOpenAI(
            openai_api_key="EMPTY",
            openai_api_base=self.llm_endpoint + "/v1",
            model_name=MODEL_NAME,
            default_headers=headers,
            max_tokens=input.max_tokens,
            top_p=input.top_p,
            streaming=input.stream,
            temperature=input.temperature,
            presence_penalty=input.repetition_penalty,
        )
        result = await self.generate(input, self.client)

        return result
comps/llms/src/doc-summarization/opea_docsum_microservice.py (new file, 58 lines)
@@ -0,0 +1,58 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import time

from integrations.tgi import OPEADocSum_TGI
from integrations.vllm import OPEADocSum_vLLM

from comps import (
    CustomLogger,
    DocSumLLMParams,
    OpeaComponentLoader,
    ServiceType,
    opea_microservices,
    register_microservice,
    register_statistics,
    statistics_dict,
)

logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)

llm_component_name = os.getenv("DocSum_COMPONENT_NAME", "OPEADocSum_TGI")
# Initialize OpeaComponentLoader
loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM DocSum Component: {llm_component_name}")


@register_microservice(
    name="opea_service@llm_docsum",
    service_type=ServiceType.LLM,
    endpoint="/v1/docsum",
    host="0.0.0.0",
    port=9000,
)
@register_statistics(names=["opea_service@llm_docsum"])
async def llm_generate(input: DocSumLLMParams):
    start = time.time()

    # Log the input if logging is enabled
    if logflag:
        logger.info(input)

    try:
        # Use the controller to invoke the active component
        response = await loader.invoke(input)
        # Record statistics
        statistics_dict["opea_service@llm_docsum"].append_latency(time.time() - start, None)
        return response

    except Exception as e:
        logger.error(f"Error during DocSum invocation: {e}")
        raise


if __name__ == "__main__":
    logger.info("OPEA DocSum Microservice is starting...")
    opea_microservices["opea_service@llm_docsum"].start()
@@ -1,37 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tgi_service:
    image: ghcr.io/huggingface/text-generation-inference:2.1.0
    container_name: tgi-service
    ports:
      - "8008:80"
    volumes:
      - "./data:/data"
    environment:
      HF_TOKEN: ${HF_TOKEN}
    shm_size: 1g
    command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
  llm:
    image: opea/llm-docsum-tgi:latest
    container_name: llm-docsum-tgi-server
    ports:
      - "9000:9000"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
    restart: unless-stopped

networks:
  default:
    driver: bridge
@@ -1,8 +0,0 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

pip --no-cache-dir install -r requirements-runtime.txt

python llm.py
@@ -1,245 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token

logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)

# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", 2048))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS", 4096))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "Intel/neural-chat-7b-v3-3")

templ_en = """Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:"""

templ_zh = """请简要概括以下内容:


"{text}"


概况:"""


templ_refine_en = """Your job is to produce a final summary.
We have provided an existing summary up to a certain point, then we will provide more context.
You need to refine the existing summary (only if needed) with new context and generate a final summary.


Existing Summary:
"{existing_answer}"



New Context:
"{text}"



Final Summary:

"""

templ_refine_zh = """\
你的任务是生成一个最终摘要。
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。


初始摘要:
"{existing_answer}"



新的文本:
"{text}"



最终摘要:

"""


@register_microservice(
    name="opea_service@llm_docsum",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/docsum",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: DocSumLLMParams):
    if logflag:
        logger.info(input)

    ### check summary type
    summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
    if input.summary_type not in summary_types:
        raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
    if input.summary_type == "auto":  ### Check input token length in auto mode
        token_len = len(tokenizer.encode(input.query))
        if token_len > MAX_INPUT_TOKENS + 50:
            input.summary_type = "refine"
            if logflag:
                logger.info(
                    f"Input token length {token_len} exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'refine' mode."
                )
        else:
            input.summary_type = "stuff"
            if logflag:
                logger.info(
                    f"Input token length {token_len} not exceed MAX_INPUT_TOKENS + 50 {MAX_INPUT_TOKENS+50}, auto switch to 'stuff' mode."
                )

    if input.language in ["en", "auto"]:
        templ = templ_en
        templ_refine = templ_refine_en
    elif input.language in ["zh"]:
        templ = templ_zh
        templ_refine = templ_refine_zh
    else:
        raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')

    ## Prompt
    PROMPT = PromptTemplate.from_template(templ)
    if input.summary_type == "refine":
        PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
    if logflag:
        logger.info("After prompting:")
        logger.info(PROMPT)
        if input.summary_type == "refine":
            logger.info(PROMPT_REFINE)

    ## Split text
    if input.summary_type == "stuff":
        text_splitter = CharacterTextSplitter()
    else:
        if input.summary_type == "refine":
            if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
                raise RuntimeError("In Refine mode, Please set MAX_TOTAL_TOKENS larger than (max_tokens * 2 + 128)")
            max_input_tokens = min(
                MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
            )  # 128 is reserved token length for prompt
        else:
            if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
                raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than max_tokens + 50)")
            max_input_tokens = min(
                MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
            )  # 50 is reserved token length for prompt
        chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
        chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        if logflag:
            logger.info(f"set chunk size to: {chunk_size}")
            logger.info(f"set chunk overlap to: {chunk_overlap}")

    texts = text_splitter.split_text(input.query)
    docs = [Document(page_content=t) for t in texts]
    if logflag:
        logger.info(f"Split input query into {len(docs)} chunks")
        logger.info(f"The character length of the first chunk is {len(texts[0])}")

    ## Access auth
    access_token = (
        get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
    )
    server_kwargs = {}
    if access_token:
        server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}

    ## LLM
    if input.stream and input.summary_type == "map_reduce":
        logger.info("Map Reduce mode don't support stream=True, set to stream=False")
        input.stream = False
    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
    llm = HuggingFaceEndpoint(
        endpoint_url=llm_endpoint,
        max_new_tokens=input.max_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        typical_p=input.typical_p,
        temperature=input.temperature,
        repetition_penalty=input.repetition_penalty,
        streaming=input.stream,
        server_kwargs=server_kwargs,
    )

    ## LLM chain
    summary_type = input.summary_type
    if summary_type == "stuff":
        llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
    elif summary_type == "truncate":
        docs = [docs[0]]
        llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
    elif summary_type == "map_reduce":
        llm_chain = load_summarize_chain(
            llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
        )
    elif summary_type == "refine":
        llm_chain = load_summarize_chain(
            llm=llm,
            question_prompt=PROMPT,
            refine_prompt=PROMPT_REFINE,
            chain_type="refine",
            return_intermediate_steps=True,
        )
    else:
        raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')

    if input.stream:

        async def stream_generator():
            from langserve.serialization import WellKnownLCSerializer

            _serializer = WellKnownLCSerializer()
            async for chunk in llm_chain.astream_log(docs):
                data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
                if logflag:
                    logger.info(data)
                yield f"data: {data}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = await llm_chain.ainvoke(docs)

        if input.summary_type in ["map_reduce", "refine"]:
            intermediate_steps = response["intermediate_steps"]
            if logflag:
                logger.info("intermediate_steps:")
                logger.info(intermediate_steps)

        output_text = response["output_text"]
        if logflag:
            logger.info("\n\noutput_text:")
            logger.info(output_text)

        return GeneratedDoc(text=output_text, prompt=input.query)


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
    opea_microservices["opea_service@llm_docsum"].start()
@@ -1,28 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ARG ARCH="cpu"

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/llms/summarization/vllm/langchain/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/summarization/vllm/langchain

ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -1,171 +0,0 @@
# Document Summary vLLM Microservice

This microservice leverages LangChain to implement summarization strategies and facilitate LLM inference using vLLM.
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving, it delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention, Continuous batching and etc.. Besides GPUs, vLLM already supported [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products).

## 🚀1. Start Microservice with Python 🐍 (Option 1)

To start the LLM microservice, you need to install python packages first.

### 1.1 Install Requirements

```bash
pip install -r requirements.txt
```

### 1.2 Start LLM Service

```bash
export HF_TOKEN=${your_hf_api_token}
export LLM_MODEL_ID=${your_hf_llm_model}
docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID}
```

### 1.3 Verify the vLLM Service

```bash
curl http://${your_ip}:8008/v1/chat/completions \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "What is Deep Learning? "}]}'
```

### 1.4 Start LLM Service with Python Script

```bash
export vLLM_ENDPOINT="http://${your_ip}:8008"
python llm.py
```

## 🚀2. Start Microservice with Docker 🐳 (Option 2)

If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a vLLM/vLLM service with docker.

To setup or build the vLLM image follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi)

### 2.1 Setup Environment Variables

In order to start vLLM and LLM services, you need to setup the following environment variables first.

```bash
export HF_TOKEN=${your_hf_api_token}
export vLLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```

### 2.2 Build Docker Image

```bash
cd ../../../../../
docker build -t opea/llm-docsum-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/vllm/langchain/Dockerfile .
```

To start a docker container, you have two options:

- A. Run Docker with CLI
- B. Run Docker with Docker Compose

You can choose one as needed.

### 2.3 Run Docker with CLI (Option A)

```bash
docker run -d --name="llm-docsum-vllm-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-docsum-vllm:latest
```

### 2.4 Run Docker with Docker Compose (Option B)

```bash
docker compose -f docker_compose_llm.yaml up -d
```

## 🚀3. Consume LLM Service

### 3.1 Check Service Status

```bash
curl http://${your_ip}:9000/v1/health_check\
  -X GET \
  -H 'Content-Type: application/json'
```

### 3.2 Consume LLM Service

In DocSum microservice, except for basic LLM parameters, we also support several optimization parameters setting.

- "language": specify the language, can be "auto", "en", "zh", default is "auto"

If you want to deal with long context, can select suitable summary type, details in section 3.2.2.

- "summary_type": can be "auto", "stuff", "truncate", "map_reduce", "refine", default is "auto"
- "chunk_size": max token length for each chunk. Set to be different default value according to "summary_type".
- "chunk_overlap": overlap token length between each chunk, default is 0.1\*chunk_size

#### 3.2.1 Basic usage

```bash
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
  -H 'Content-Type: application/json'

# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
  -H 'Content-Type: application/json'

# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
  -X POST \
  -d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
  -H 'Content-Type: application/json'
```

#### 3.2.2 Long context summarization with "summary_type"

**summary_type=auto**

"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.

**summary_type=stuff**

In this mode LLM generate summary based on complete input text. In this case please carefully set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` according to your model and device memory, otherwise it may exceed LLM context limit and raise error when meet long context.

**summary_type=truncate**

Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "truncate", "chunk_size": 2000}' \
  -H 'Content-Type: application/json'
```

**summary_type=map_reduce**

Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
  -H 'Content-Type: application/json'
```

**summary_type=refine**

Refin mode will split the inputs into multiple chunks, generate summary for the first one, then combine with the second, loops over every remaining chunks to get the final summary.

In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
  -X POST \
  -d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
  -H 'Content-Type: application/json'
```
@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
@@ -1,247 +0,0 @@
|
||||
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
from pathlib import Path as p

from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_community.llms import VLLMOpenAI
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

from comps import CustomLogger, DocSumLLMParams, GeneratedDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token

logger = CustomLogger("llm_docsum")
logflag = os.getenv("LOGFLAG", False)

# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS"))
MAX_TOTAL_TOKENS = int(os.getenv("MAX_TOTAL_TOKENS"))
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", None)

templ_en = """Write a concise summary of the following:
|
||||
|
||||
|
||||
"{text}"
|
||||
|
||||
|
||||
CONCISE SUMMARY:"""
|
||||
|
||||
templ_zh = """请简要概括以下内容:
|
||||
|
||||
|
||||
"{text}"
|
||||
|
||||
|
||||
概况:"""
|
||||
|
||||
|
||||
templ_refine_en = """Your job is to produce a final summary.
|
||||
We have provided an existing summary up to a certain point, then we will provide more context.
|
||||
You need to refine the existing summary (only if needed) with new context and generate a final summary.
|
||||
|
||||
|
||||
Existing Summary:
|
||||
"{existing_answer}"
|
||||
|
||||
|
||||
|
||||
New Context:
|
||||
"{text}"
|
||||
|
||||
|
||||
|
||||
Final Summary:
|
||||
|
||||
"""
|
||||
|
||||
templ_refine_zh = """\
|
||||
你的任务是生成一个最终摘要。
|
||||
我们已经处理好部分文本并生成初始摘要, 并提供了新的未处理文本
|
||||
你需要根据新提供的文本,结合初始摘要,生成一个最终摘要。
|
||||
|
||||
|
||||
初始摘要:
|
||||
"{existing_answer}"
|
||||
|
||||
|
||||
|
||||
新的文本:
|
||||
"{text}"
|
||||
|
||||
|
||||
|
||||
最终摘要:
|
||||
|
||||
"""
|
||||
|
||||
|
||||
@register_microservice(
    name="opea_service@llm_docsum",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/docsum",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: DocSumLLMParams):
    if logflag:
        logger.info(input)

    ### check summary type
    summary_types = ["auto", "stuff", "truncate", "map_reduce", "refine"]
    if input.summary_type not in summary_types:
        raise NotImplementedError(f"Please specify the summary_type in {summary_types}")
    if input.summary_type == "auto":  ### Check input token length in auto mode
        token_len = len(tokenizer.encode(input.query))
        if token_len > MAX_INPUT_TOKENS + 50:
            input.summary_type = "refine"
            if logflag:
                logger.info(
                    f"Input token length {token_len} exceeds MAX_INPUT_TOKENS + 50 ({MAX_INPUT_TOKENS+50}), auto-switching to 'refine' mode."
                )
        else:
            input.summary_type = "stuff"
            if logflag:
                logger.info(
                    f"Input token length {token_len} does not exceed MAX_INPUT_TOKENS + 50 ({MAX_INPUT_TOKENS+50}), auto-switching to 'stuff' mode."
                )

    if input.language in ["en", "auto"]:
        templ = templ_en
        templ_refine = templ_refine_en
    elif input.language in ["zh"]:
        templ = templ_zh
        templ_refine = templ_refine_zh
    else:
        raise NotImplementedError('Please specify the input language in "en", "zh", "auto"')

    ## Prompt
    PROMPT = PromptTemplate.from_template(templ)
    if input.summary_type == "refine":
        PROMPT_REFINE = PromptTemplate.from_template(templ_refine)
    if logflag:
        logger.info("After prompting:")
        logger.info(PROMPT)
        if input.summary_type == "refine":
            logger.info(PROMPT_REFINE)

    ## Split text
    if input.summary_type == "stuff":
        text_splitter = CharacterTextSplitter()
    else:
        if input.summary_type == "refine":
            if MAX_TOTAL_TOKENS <= 2 * input.max_tokens + 128:
                raise RuntimeError("In refine mode, please set MAX_TOTAL_TOKENS larger than (2 * max_tokens + 128).")
            max_input_tokens = min(
                MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS
            )  # 128 is reserved token length for prompt
        else:
            if MAX_TOTAL_TOKENS <= input.max_tokens + 50:
                raise RuntimeError("Please set MAX_TOTAL_TOKENS larger than (max_tokens + 50).")
            max_input_tokens = min(
                MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS
            )  # 50 is reserved token length for prompt
        chunk_size = min(input.chunk_size, max_input_tokens) if input.chunk_size > 0 else max_input_tokens
        chunk_overlap = input.chunk_overlap if input.chunk_overlap > 0 else int(0.1 * chunk_size)
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        if logflag:
            logger.info(f"set chunk size to: {chunk_size}")
            logger.info(f"set chunk overlap to: {chunk_overlap}")

    texts = text_splitter.split_text(input.query)
    docs = [Document(page_content=t) for t in texts]
    if logflag:
        logger.info(f"Split input query into {len(docs)} chunks")
        logger.info(f"The character length of the first chunk is {len(texts[0])}")

    ## Access auth
    access_token = (
        get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
    )
    headers = {}
    if access_token:
        headers = {"Authorization": f"Bearer {access_token}"}

    ## LLM
    if input.stream and input.summary_type == "map_reduce":
        logger.info("Map_reduce mode does not support stream=True; setting stream=False.")
        input.stream = False
    llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
    model = input.model if input.model else os.getenv("LLM_MODEL_ID")
    llm = VLLMOpenAI(
        openai_api_key="EMPTY",
        openai_api_base=llm_endpoint + "/v1",
        model_name=model,
        default_headers=headers,
        max_tokens=input.max_tokens,
        top_p=input.top_p,
        streaming=input.stream,
        temperature=input.temperature,
        presence_penalty=input.repetition_penalty,
    )

    ## LLM chain
    summary_type = input.summary_type
    if summary_type == "stuff":
        llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
    elif summary_type == "truncate":
        docs = [docs[0]]
        llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
    elif summary_type == "map_reduce":
        llm_chain = load_summarize_chain(
            llm=llm, map_prompt=PROMPT, combine_prompt=PROMPT, chain_type="map_reduce", return_intermediate_steps=True
        )
    elif summary_type == "refine":
        llm_chain = load_summarize_chain(
            llm=llm,
            question_prompt=PROMPT,
            refine_prompt=PROMPT_REFINE,
            chain_type="refine",
            return_intermediate_steps=True,
        )
    else:
        raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')

    if input.stream:

        async def stream_generator():
            from langserve.serialization import WellKnownLCSerializer

            _serializer = WellKnownLCSerializer()
            async for chunk in llm_chain.astream_log(docs):
                data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
                if logflag:
                    logger.info(data)
                yield f"data: {data}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = await llm_chain.ainvoke(docs)

        if input.summary_type in ["map_reduce", "refine"]:
            intermediate_steps = response["intermediate_steps"]
            if logflag:
                logger.info("intermediate_steps:")
                logger.info(intermediate_steps)

        output_text = response["output_text"]
        if logflag:
            logger.info("\n\noutput_text:")
            logger.info(output_text)

        return GeneratedDoc(text=output_text, prompt=input.query)


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
    opea_microservices["opea_service@llm_docsum"].start()
@@ -1 +0,0 @@
langserve
@@ -1,16 +0,0 @@
docarray[full]
fastapi
httpx==0.27.2
huggingface_hub
langchain #==0.1.12
langchain-huggingface
langchain-openai
langchain_community
langchainhub
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn