Support Long context for DocSum (#1255)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: lkk <33276950+lkk12014402@users.noreply.github.com>
This commit is contained in:
XinyaoWa
2024-12-20 19:17:10 +08:00
committed by GitHub
parent 05365b6140
commit 50dd959d60
15 changed files with 861 additions and 267 deletions

View File

@@ -27,7 +27,7 @@ services:
security_opt:
- seccomp:unconfined
ipc: host
command: --model-id ${DOCSUM_LLM_MODEL_ID}
command: --model-id ${DOCSUM_LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
docsum-llm-server:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
@@ -53,6 +53,9 @@ services:
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
restart: unless-stopped
whisper:

View File

@@ -3,6 +3,8 @@
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${host_ip}

View File

@@ -223,11 +223,12 @@ You will have the following Docker Images:
Text:
```bash
## json input
curl -X POST http://${host_ip}:8888/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
# Use English mode (default).
# form input, use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
@@ -290,6 +291,93 @@ You will have the following Docker Images:
-F "stream=true"
```
7. MegaService with long context
If you want to handle long context, set the following parameters and select a suitable summary type; see the example request after this list.
- "summary_type": one of "auto", "stuff", "truncate", "map_reduce", "refine"; default is "auto".
- "chunk_size": maximum token length of each chunk; the default depends on "summary_type".
- "chunk_overlap": number of overlapping tokens between adjacent chunks; default is 0.1\*chunk_size.
**summary_type=auto**
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=auto"
```
**summary_type=stuff**
In this mode the LLM generates the summary from the complete input text. Set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM context limit and raise an error.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=stuff"
```
**summary_type=truncate**
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=truncate"
```
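For a concrete sense of the chunk-length formula above, here is a minimal sketch of the arithmetic, assuming illustrative values of `MAX_INPUT_TOKENS=1024`, `MAX_TOTAL_TOKENS=2048`, and a request with `max_tokens=32`:
```bash
# Illustrative only: chunk length kept by truncate mode under assumed settings.
MAX_INPUT_TOKENS=1024
MAX_TOTAL_TOKENS=2048
MAX_TOKENS=32                                              # the request's max_tokens
CHUNK=$(( MAX_TOTAL_TOKENS - MAX_TOKENS - 50 ))            # 1966
(( CHUNK > MAX_INPUT_TOKENS )) && CHUNK=$MAX_INPUT_TOKENS  # capped at 1024
echo "truncate keeps the first ${CHUNK} tokens"
```
With these illustrative values, `MAX_INPUT_TOKENS` is the binding limit, so only the first 1024 tokens of the document are kept.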
**summary_type=map_reduce**
Map_reduce mode splits the input into multiple chunks, maps each chunk to an individual summary, and then consolidates those summaries into a single global summary. `streaming=True` is not allowed here.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=map_reduce"
```
**summary_type=refine**
Refine mode splits the input into multiple chunks, generates a summary for the first chunk, combines it with the second chunk, and then loops over every remaining chunk to produce the final summary.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=refine"
```
## 🚀 Launch the UI
Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, the Gradio UI is recommended.

View File

@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
tgi-server:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
container_name: tgi-server
ports:
- "8008:80"
environment:
@@ -16,13 +16,13 @@ services:
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
container_name: llm-docsum-server
depends_on:
- tgi-service
- tgi-server
ports:
- "9000:9000"
ipc: host
@@ -32,11 +32,15 @@ services:
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LOGFLAG: True
restart: unless-stopped
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
container_name: whisper-server
ports:
- "7066:7066"
ipc: host
@@ -48,7 +52,7 @@ services:
dataprep-audio2text:
image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
container_name: dataprep-audio2text-service
container_name: dataprep-audio2text-server
ports:
- "9099:9099"
ipc: host
@@ -57,7 +61,7 @@ services:
dataprep-video2audio:
image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
container_name: dataprep-video2audio-service
container_name: dataprep-video2audio-server
ports:
- "7078:7078"
ipc: host
@@ -78,7 +82,7 @@ services:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-xeon-backend-server
depends_on:
- tgi-service
- tgi-server
- llm-docsum-tgi
- dataprep-multimedia2text
- dataprep-video2audio

View File

@@ -207,18 +207,19 @@ You will have the following Docker Images:
Text:
```bash
## json input
curl -X POST http://${host_ip}:8888/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
# Use English mode (default).
# form input. Use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
-F "stream=True"
# Use Chinese mode.
curl http://${host_ip}:8888/v1/docsum \
@@ -227,7 +228,7 @@ You will have the following Docker Images:
-F "messages=2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。" \
-F "max_tokens=32" \
-F "language=zh" \
-F "stream=true"
-F "stream=True"
# Upload file
curl http://${host_ip}:8888/v1/docsum \
@@ -237,7 +238,6 @@ You will have the following Docker Images:
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
```
> Audio and video file uploads are not supported in DocSum via curl requests; please use the Gradio UI.
@@ -255,7 +255,7 @@ You will have the following Docker Images:
-F "messages=UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
-F "stream=True"
```
Video:
@@ -271,7 +271,94 @@ You will have the following Docker Images:
-F "messages=convert your video to base64 data type" \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
-F "stream=True"
```
7. MegaService with long context
If you want to handle long context, set the following parameters and select a suitable summary type; see the sketch after this list.
- "summary_type": one of "auto", "stuff", "truncate", "map_reduce", "refine"; default is "auto".
- "chunk_size": maximum token length of each chunk; the default depends on "summary_type".
- "chunk_overlap": number of overlapping tokens between adjacent chunks; default is 0.1\*chunk_size.
**summary_type=auto**
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=auto"
```
**summary_type=stuff**
In this mode the LLM generates the summary from the complete input text. Set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM context limit and raise an error.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=stuff"
```
**summary_type=truncate**
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=truncate"
```
**summary_type=map_reduce**
Map_reduce mode splits the input into multiple chunks, maps each chunk to an individual summary, and then consolidates those summaries into a single global summary. `streaming=True` is not allowed here.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=map_reduce"
```
**summary_type=refine**
Refine mode splits the input into multiple chunks, generates a summary for the first chunk, combines it with the second chunk, and then loops over every remaining chunk to produce the final summary.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=refine"
```
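For a concrete sense of the default `chunk_size` in refine mode, here is a minimal sketch of the arithmetic, using for illustration `MAX_INPUT_TOKENS=1024` and `MAX_TOTAL_TOKENS=2048` (the values exported in the Gaudi set_env.sh in this change) and a request with `max_tokens=32`:
```bash
# Illustrative only: default chunk_size in refine mode under assumed settings.
MAX_INPUT_TOKENS=1024
MAX_TOTAL_TOKENS=2048
MAX_TOKENS=32                                                        # the request's max_tokens
CHUNK_SIZE=$(( MAX_TOTAL_TOKENS - 2 * MAX_TOKENS - 128 ))            # 1856
(( CHUNK_SIZE > MAX_INPUT_TOKENS )) && CHUNK_SIZE=$MAX_INPUT_TOKENS  # capped at 1024
echo "default refine chunk_size: ${CHUNK_SIZE} tokens"
```
As with truncate mode, `MAX_INPUT_TOKENS` is the binding limit here, so each chunk defaults to 1024 tokens.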
> More detailed tests can be found here `cd GenAIExamples/DocSum/test`

View File

@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
tgi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-gaudi-server
ports:
@@ -23,13 +23,13 @@ services:
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
container_name: llm-docsum-gaudi-server
depends_on:
- tgi-service
- tgi-server
ports:
- "9000:9000"
ipc: host
@@ -39,11 +39,15 @@ services:
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LOGFLAG: True
restart: unless-stopped
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
container_name: whisper-server
ports:
- "7066:7066"
ipc: host
@@ -60,7 +64,7 @@ services:
dataprep-audio2text:
image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
container_name: dataprep-audio2text-service
container_name: dataprep-audio2text-server
ports:
- "9199:9099"
ipc: host
@@ -69,7 +73,7 @@ services:
dataprep-video2audio:
image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
container_name: dataprep-video2audio-service
container_name: dataprep-video2audio-server
ports:
- "7078:7078"
ipc: host
@@ -90,7 +94,7 @@ services:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-service
- tgi-server
- llm-docsum-tgi
- dataprep-multimedia2text
- dataprep-video2audio

View File

@@ -6,6 +6,9 @@ pushd "../../" > /dev/null
source .set_env.sh
popd > /dev/null
export MAX_INPUT_TOKENS=1024
export MAX_TOTAL_TOKENS=2048
export no_proxy="${no_proxy},${host_ip}"
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export MEGA_SERVICE_HOST_IP=${host_ip}