Support Long context for DocSum (#1255)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: lkk <33276950+lkk12014402@users.noreply.github.com>
This commit is contained in:
XinyaoWa
2024-12-20 19:17:10 +08:00
committed by GitHub
parent 05365b6140
commit 50dd959d60
15 changed files with 861 additions and 267 deletions

View File

@@ -27,7 +27,7 @@ services:
security_opt:
- seccomp:unconfined
ipc: host
command: --model-id ${DOCSUM_LLM_MODEL_ID}
command: --model-id ${DOCSUM_LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
docsum-llm-server:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
@@ -53,6 +53,9 @@ services:
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: "http://${HOST_IP}:${DOCSUM_TGI_SERVICE_PORT}"
HUGGINGFACEHUB_API_TOKEN: ${DOCSUM_HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${DOCSUM_LLM_MODEL_ID}
restart: unless-stopped
whisper:

View File

@@ -3,6 +3,8 @@
# Copyright (C) 2024 Advanced Micro Devices, Inc.
# SPDX-License-Identifier: Apache-2.0
export MAX_INPUT_TOKENS=2048
export MAX_TOTAL_TOKENS=4096
export DOCSUM_TGI_IMAGE="ghcr.io/huggingface/text-generation-inference:2.3.1-rocm"
export DOCSUM_LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
export HOST_IP=${host_ip}

View File

@@ -223,11 +223,12 @@ You will have the following Docker Images:
Text:
```bash
## json input
curl -X POST http://${host_ip}:8888/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
# Use English mode (default).
# form input, use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
@@ -290,6 +291,93 @@ You will have the following Docker Images:
-F "stream=true"
```
7. MegaService with long context
If you want to handle long context, set the following parameters and select a suitable summary type; see the example request after this list.
- "summary_type": one of "auto", "stuff", "truncate", "map_reduce", "refine"; default is "auto".
- "chunk_size": maximum token length of each chunk; the default depends on "summary_type".
- "chunk_overlap": number of overlapping tokens between adjacent chunks; default is 0.1\*chunk_size.
**summary_type=auto**
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=auto"
```
**summary_type=stuff**
In this mode the LLM generates the summary from the complete input text. Set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM context limit and raise an error.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=stuff"
```
**summary_type=truncate**
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=truncate"
```
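For a concrete sense of the chunk-length formula above, here is a minimal sketch of the arithmetic, assuming illustrative values of `MAX_INPUT_TOKENS=1024`, `MAX_TOTAL_TOKENS=2048`, and a request with `max_tokens=32`:
```bash
# Illustrative only: chunk length kept by truncate mode under assumed settings.
MAX_INPUT_TOKENS=1024
MAX_TOTAL_TOKENS=2048
MAX_TOKENS=32                                              # the request's max_tokens
CHUNK=$(( MAX_TOTAL_TOKENS - MAX_TOKENS - 50 ))            # 1966
(( CHUNK > MAX_INPUT_TOKENS )) && CHUNK=$MAX_INPUT_TOKENS  # capped at 1024
echo "truncate keeps the first ${CHUNK} tokens"
```
With these illustrative values, `MAX_INPUT_TOKENS` is the binding limit, so only the first 1024 tokens of the document are kept.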
**summary_type=map_reduce**
Map_reduce mode splits the input into multiple chunks, maps each chunk to an individual summary, and then consolidates those summaries into a single global summary. `streaming=True` is not allowed here.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=map_reduce"
```
**summary_type=refine**
Refine mode splits the input into multiple chunks, generates a summary for the first chunk, combines it with the second chunk, and then loops over every remaining chunk to produce the final summary.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=refine"
```
## 🚀 Launch the UI
Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, the Gradio UI is recommended.

View File

@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
tgi-server:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-service
container_name: tgi-server
ports:
- "8008:80"
environment:
@@ -16,13 +16,13 @@ services:
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
container_name: llm-docsum-server
depends_on:
- tgi-service
- tgi-server
ports:
- "9000:9000"
ipc: host
@@ -32,11 +32,15 @@ services:
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LOGFLAG: True
restart: unless-stopped
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
container_name: whisper-server
ports:
- "7066:7066"
ipc: host
@@ -48,7 +52,7 @@ services:
dataprep-audio2text:
image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
container_name: dataprep-audio2text-service
container_name: dataprep-audio2text-server
ports:
- "9099:9099"
ipc: host
@@ -57,7 +61,7 @@ services:
dataprep-video2audio:
image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
container_name: dataprep-video2audio-service
container_name: dataprep-video2audio-server
ports:
- "7078:7078"
ipc: host
@@ -78,7 +82,7 @@ services:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-xeon-backend-server
depends_on:
- tgi-service
- tgi-server
- llm-docsum-tgi
- dataprep-multimedia2text
- dataprep-video2audio

View File

@@ -207,18 +207,19 @@ You will have the following Docker Images:
Text:
```bash
## json input
curl -X POST http://${host_ip}:8888/v1/docsum \
-H "Content-Type: application/json" \
-d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
# Use English mode (default).
# form input. Use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
-F "stream=True"
# Use Chinese mode.
curl http://${host_ip}:8888/v1/docsum \
@@ -227,7 +228,7 @@ You will have the following Docker Images:
-F "messages=2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。" \
-F "max_tokens=32" \
-F "language=zh" \
-F "stream=true"
-F "stream=True"
# Upload file
curl http://${host_ip}:8888/v1/docsum \
@@ -237,7 +238,6 @@ You will have the following Docker Images:
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
```
> Audio and video file uploads are not supported in DocSum via curl requests; please use the Gradio UI.
@@ -255,7 +255,7 @@ You will have the following Docker Images:
-F "messages=UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
-F "stream=True"
```
Video:
@@ -271,7 +271,94 @@ You will have the following Docker Images:
-F "messages=convert your video to base64 data type" \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
-F "stream=True"
```
7. MegaService with long context
If you want to handle long context, set the following parameters and select a suitable summary type; see the sketch after this list.
- "summary_type": one of "auto", "stuff", "truncate", "map_reduce", "refine"; default is "auto".
- "chunk_size": maximum token length of each chunk; the default depends on "summary_type".
- "chunk_overlap": number of overlapping tokens between adjacent chunks; default is 0.1\*chunk_size.
**summary_type=auto**
"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=auto"
```
**summary_type=stuff**
In this mode the LLM generates the summary from the complete input text. Set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM context limit and raise an error.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=stuff"
```
**summary_type=truncate**
Truncate mode will truncate the input text and keep only the first chunk, whose length is equal to `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=truncate"
```
**summary_type=map_reduce**
Map_reduce mode splits the input into multiple chunks, maps each chunk to an individual summary, and then consolidates those summaries into a single global summary. `streaming=True` is not allowed here.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=map_reduce"
```
**summary_type=refine**
Refine mode splits the input into multiple chunks, generates a summary for the first chunk, combines it with the second chunk, and then loops over every remaining chunk to produce the final summary.
In this mode, the default `chunk_size` is `min(MAX_TOTAL_TOKENS - 2 * input.max_tokens - 128, MAX_INPUT_TOKENS)`.
```bash
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
-F "type=text" \
-F "messages=" \
-F "max_tokens=32" \
-F "files=@/path to your file (.txt, .docx, .pdf)" \
-F "language=en" \
-F "summary_type=refine"
```
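For a concrete sense of the default `chunk_size` in refine mode, here is a minimal sketch of the arithmetic, using for illustration `MAX_INPUT_TOKENS=1024` and `MAX_TOTAL_TOKENS=2048` (the values exported in the Gaudi set_env.sh in this change) and a request with `max_tokens=32`:
```bash
# Illustrative only: default chunk_size in refine mode under assumed settings.
MAX_INPUT_TOKENS=1024
MAX_TOTAL_TOKENS=2048
MAX_TOKENS=32                                                        # the request's max_tokens
CHUNK_SIZE=$(( MAX_TOTAL_TOKENS - 2 * MAX_TOKENS - 128 ))            # 1856
(( CHUNK_SIZE > MAX_INPUT_TOKENS )) && CHUNK_SIZE=$MAX_INPUT_TOKENS  # capped at 1024
echo "default refine chunk_size: ${CHUNK_SIZE} tokens"
```
As with truncate mode, `MAX_INPUT_TOKENS` is the binding limit here, so each chunk defaults to 1024 tokens.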
> More detailed tests can be found here `cd GenAIExamples/DocSum/test`

View File

@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
services:
tgi-service:
tgi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-gaudi-server
ports:
@@ -23,13 +23,13 @@ services:
cap_add:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
command: --model-id ${LLM_MODEL_ID} --max-input-length ${MAX_INPUT_TOKENS} --max-total-tokens ${MAX_TOTAL_TOKENS}
llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
container_name: llm-docsum-gaudi-server
depends_on:
- tgi-service
- tgi-server
ports:
- "9000:9000"
ipc: host
@@ -39,11 +39,15 @@ services:
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
LLM_MODEL_ID: ${LLM_MODEL_ID}
LOGFLAG: True
restart: unless-stopped
whisper:
image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
container_name: whisper-service
container_name: whisper-server
ports:
- "7066:7066"
ipc: host
@@ -60,7 +64,7 @@ services:
dataprep-audio2text:
image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
container_name: dataprep-audio2text-service
container_name: dataprep-audio2text-server
ports:
- "9199:9099"
ipc: host
@@ -69,7 +73,7 @@ services:
dataprep-video2audio:
image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
container_name: dataprep-video2audio-service
container_name: dataprep-video2audio-server
ports:
- "7078:7078"
ipc: host
@@ -90,7 +94,7 @@ services:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-service
- tgi-server
- llm-docsum-tgi
- dataprep-multimedia2text
- dataprep-video2audio

View File

@@ -6,6 +6,9 @@ pushd "../../" > /dev/null
source .set_env.sh
popd > /dev/null
export MAX_INPUT_TOKENS=1024
export MAX_TOTAL_TOKENS=2048
export no_proxy="${no_proxy},${host_ip}"
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export MEGA_SERVICE_HOST_IP=${host_ip}