Refactor FaqGen (#1093)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
XinyaoWa
2025-01-13 11:30:59 +08:00
committed by GitHub
parent 3f23bf582a
commit ea72c943bd
31 changed files with 962 additions and 551 deletions

View File

@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge
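A minimal launch sketch for this Xeon TGI stack, assuming the file is saved as `faq-generation_tgi.yaml` (the name used in the FaqGen README later in this commit) and run from its directory; the exported values are placeholders, not defaults:

```bash
# Placeholder values - substitute your own host IP, HF token and model id.
export host_ip=<your_host_ip>
export LLM_ENDPOINT_PORT=8008
export FAQ_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=<your_hf_api_token>
export LLM_MODEL_ID=<your_hf_llm_model>
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI"

docker compose -f faq-generation_tgi.yaml up -d
```

Because the `llm` service waits on the TGI health check (`depends_on: condition: service_healthy`, up to 100 retries), the first start can take a while if the model is still downloading.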

View File

@@ -0,0 +1,61 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
llm:
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
vllm-service:
image: opea/vllm:latest
container_name: vllm-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm:
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
vllm-service:
condition: service_healthy
ports:
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge
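The same environment variables apply to this vLLM variant; what changes is the component name and the optional warm-up skip (a sketch, assuming the file is saved as `faq-generation_vllm.yaml` as referenced in the README below, and that the `opea/vllm:latest` image has already been built):

```bash
export FAQGen_COMPONENT_NAME="OPEAFAQGen_vLLM"   # registry name of the vLLM integration
export VLLM_SKIP_WARMUP=false                    # optional; the compose file defaults to false
docker compose -f faq-generation_vllm.yaml up -d
```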

View File

@@ -8,37 +8,49 @@ services:
image: opea/vllm-gaudi:latest
container_name: vllm-gaudi-server
ports:
- "8008:80"
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
llm:
image: opea/llm-faqgen-vllm:latest
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
- vllm-service
vllm-service:
condition: service_healthy
ports:
- "9000:9000"
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
vLLM_ENDPOINT: ${vLLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:

View File

@@ -1,75 +0,0 @@
# TGI FAQGen LLM Microservice
This microservice interacts with the TGI LLM server to generate FAQs (frequently asked questions and answers) from input text. [Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
## 🚀1. Start Microservice with Docker
If you start the LLM microservice with Docker Compose, the `docker_compose_llm.yaml` file will also start a TGI service automatically.
### 1.1 Setup Environment Variables
To start the TGI and LLM services, you need to set up the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```
### 1.2 Build Docker Image
```bash
cd ../../../../../
docker build -t opea/llm-faqgen-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/langchain/Dockerfile .
```
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
### 1.3 Run Docker with CLI (Option A)
```bash
docker run -d -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID}
```
```bash
docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:latest
```
### 1.4 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
```
## 🚀3. Consume LLM Service
### 3.1 Check Service Status
```bash
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```
### 3.2 Consume FAQGen LLM Service
```bash
# Streaming Response
# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -1,34 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi_service:
image: ghcr.io/huggingface/text-generation-inference:1.4
container_name: tgi-service
ports:
- "8008:80"
volumes:
- "./data:/data"
environment:
HF_TOKEN: ${HF_TOKEN}
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
llm:
image: opea/llm-faqgen-tgi:latest
container_name: llm-faqgen-server
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py

View File

@@ -1,100 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceEndpoint
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_faqgen")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
def post_process_text(text: str):
if text == " ":
return "data: @#$\n\n"
if text == "\n":
return "data: <br/>\n\n"
if text.isspace():
return None
new_text = text.replace(" ", "@#$")
return f"data: {new_text}\n\n"
@register_microservice(
name="opea_service@llm_faqgen",
service_type=ServiceType.LLM,
endpoint="/v1/faqgen",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: LLMParamsDoc):
if logflag:
logger.info(input)
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
server_kwargs = {}
if access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
PROMPT = PromptTemplate.from_template(templ)
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
texts = text_splitter.split_text(input.query)
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
response = response["output_text"]
if logflag:
logger.info(response)
return GeneratedDoc(text=response, prompt=input.query)
if __name__ == "__main__":
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
# Split text
text_splitter = CharacterTextSplitter()
opea_microservices["opea_service@llm_faqgen"].start()

View File

@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/vllm/langchain/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/faq-generation/vllm/langchain
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,77 +0,0 @@
# vLLM FAQGen LLM Microservice
This microservice interacts with the vLLM server to generate FAQs (frequently asked questions and answers) from input text. [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving; it delivers state-of-the-art serving throughput with advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM already supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products).
## 🚀1. Start Microservice with Docker
If you start the LLM microservice with Docker Compose, the `docker_compose_llm.yaml` file will also start a vLLM service automatically.
To set up or build the vLLM image, follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi)
### 1.1 Setup Environment Variables
To start the vLLM and LLM services, you need to set up the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export vLLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```
### 1.2 Build Docker Image
```bash
cd ../../../../../
docker build -t opea/llm-faqgen-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/vllm/langchain/Dockerfile .
```
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
### 1.3 Run Docker with CLI (Option A)
```bash
docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID}
```
```bash
docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-vllm:latest
```
### 1.4 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
```
## 🚀3. Consume LLM Service
### 3.1 Check Service Status
```bash
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```
### 3.2 Consume FAQGen LLM Service
```bash
# Streaming Response
# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,102 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import VLLMOpenAI
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_faqgen")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
def post_process_text(text: str):
if text == " ":
return "data: @#$\n\n"
if text == "\n":
return "data: <br/>\n\n"
if text.isspace():
return None
new_text = text.replace(" ", "@#$")
return f"data: {new_text}\n\n"
@register_microservice(
name="opea_service@llm_faqgen",
service_type=ServiceType.LLM,
endpoint="/v1/faqgen",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: LLMParamsDoc):
if logflag:
logger.info(input)
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
headers = {}
if access_token:
headers = {"Authorization": f"Bearer {access_token}"}
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=llm_endpoint + "/v1",
model_name=model,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
PROMPT = PromptTemplate.from_template(templ)
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
texts = text_splitter.split_text(input.query)
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
response = response["output_text"]
if logflag:
logger.info(response)
return GeneratedDoc(text=response, prompt=input.query)
if __name__ == "__main__":
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
# Split text
text_splitter = CharacterTextSplitter()
opea_microservices["opea_service@llm_faqgen"].start()

View File

@@ -1,15 +0,0 @@
docarray[full]
fastapi
huggingface_hub
langchain
langchain-huggingface
langchain-openai
langchain_community
langchainhub
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn

View File

@@ -16,10 +16,10 @@ USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/tgi/langchain/requirements.txt
pip install --no-cache-dir -r /home/user/comps/llms/src/faq-generation/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/faq-generation/tgi/langchain
WORKDIR /home/user/comps/llms/src/faq-generation
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -0,0 +1,110 @@
# FAQGen LLM Microservice
This microservice interacts with a TGI or vLLM LLM server to generate FAQs (frequently asked questions and answers) from input text. You can set the backend service to either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm).
## 🚀1. Start Microservice with Docker
### 1.1 Setup Environment Variables
To start the FaqGen microservice, you need to set up the following environment variables first.
```bash
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export FAQ_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export LLM_MODEL_ID=${your_hf_llm_model}
export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI" # or "OPEAFAQGen_vLLM" for a vLLM backend
```
### 1.2 Build Docker Image
Step 1: Prepare the backend LLM Docker image.
If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM Docker image first.
This step is not needed for TGI.
Step 2: Build the FaqGen Docker image.
```bash
cd ../../../../
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
```
### 1.3 Run Docker
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
#### 1.3.1 Run Docker with CLI (Option A)
Step 1: Start the backend LLM service.
Refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guide to start a backend LLM service.
Step 2: Start the FaqGen microservice.
```bash
docker run -d \
--name="llm-faqgen-server" \
-p 9000:9000 \
--ipc=host \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e LLM_MODEL_ID=$LLM_MODEL_ID \
-e LLM_ENDPOINT=$LLM_ENDPOINT \
-e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
-e FAQGen_COMPONENT_NAME=$FAQGen_COMPONENT_NAME \
opea/llm-faqgen:latest
```
#### 1.3.2 Run Docker with Docker Compose (Option B)
```bash
cd ../../deployment/docker_compose/
# Backend is TGI on xeon
docker compose -f faq-generation_tgi.yaml up -d
# Backend is TGI on gaudi
# docker compose -f faq-generation_tgi_on_intel_hpu.yaml up -d
# Backend is vLLM on xeon
# docker compose -f faq-generation_vllm.yaml up -d
# Backend is vLLM on gaudi
# docker compose -f faq-generation_vllm_on_intel_hpu.yaml up -d
```
## 🚀2. Consume LLM Service
### 2.1 Check Service Status
```bash
curl http://${host_ip}:${FAQ_PORT}/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```
### 2.2 Consume FAQGen LLM Service
```bash
# Streaming Response
# Set stream to True (the default).
curl http://${host_ip}:${FAQ_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set stream to False.
curl http://${host_ip}:${FAQ_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128, "stream":false}' \
-H 'Content-Type: application/json'
```
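If a request fails, a quick way to inspect the microservice is through its container logs (a sketch; `llm-faqgen-server` is the container name set in the compose files above, and `LOGFLAG=True` enables request/response logging):

```bash
docker logs -f llm-faqgen-server                 # follow FaqGen logs
docker compose -f faq-generation_tgi.yaml down   # tear the stack down when finished
```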

View File

@@ -5,4 +5,4 @@
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py
python opea_faqgen_microservice.py

View File

@@ -0,0 +1,110 @@
# Copyright (C) 2024 Prediction Guard, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, ServiceType
from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs
logger = CustomLogger("opea_faqgen")
logflag = os.getenv("LOGFLAG", False)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
# Environment variables
MODEL_NAME = os.getenv("LLM_MODEL_ID")
MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
if os.getenv("LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
elif os.getenv("TGI_LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT")
elif os.getenv("vLLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT")
else:
DEFAULT_ENDPOINT = "http://localhost:8080"
def get_llm_endpoint():
if not MODEL_CONFIGS:
return DEFAULT_ENDPOINT
else:
# Validate and Load the models config if MODEL_CONFIGS is not null
configs_map = {}
try:
configs_map = load_model_configs(MODEL_CONFIGS)
except ConfigError as e:
logger.error(f"Failed to load model configurations: {e}")
raise ConfigError(f"Failed to load model configurations: {e}")
try:
return configs_map.get(MODEL_NAME).get("endpoint")
except ConfigError as e:
logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}")
raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs")
class OPEAFAQGen(OpeaComponent):
"""A specialized OPEA FAQGen component derived from OpeaComponent.
Attributes:
client (TGI/vLLM): An instance of the TGI/vLLM client for text generation.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LLM.name.lower(), description, config)
self.access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
self.text_splitter = CharacterTextSplitter()
self.llm_endpoint = get_llm_endpoint()
health_status = self.check_health()
if not health_status:
logger.error("OPEAFAQGen health check failed.")
async def generate(self, input: LLMParamsDoc, client):
"""Invokes the TGI/vLLM LLM service to generate FAQ output for the provided input.
Args:
input (LLMParamsDoc): The input text(s).
client: TGI/vLLM based client
"""
PROMPT = PromptTemplate.from_template(templ)
llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
texts = self.text_splitter.split_text(input.query)
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
response = response["output_text"]
if logflag:
logger.info(response)
return GeneratedDoc(text=response, prompt=input.query)

View File

@@ -0,0 +1,73 @@
# Copyright (C) 2024 Prediction Guard, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import HuggingFaceEndpoint
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("opea_faqgen_tgi")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEAFAQGen_TGI")
class OPEAFAQGen_TGI(OPEAFAQGen):
"""A specialized OPEA FAQGen TGI component derived from OPEAFAQGen for interacting with TGI services based on Lanchain HuggingFaceEndpoint API.
Attributes:
client (TGI): An instance of the TGI client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the TGI LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
# response = requests.get(f"{self.llm_endpoint}/health")
# Will remove after TGI gaudi fix health bug
url = f"{self.llm_endpoint}/generate"
data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}
headers = {"Content-Type": "application/json"}
response = requests.post(url=url, json=data, headers=headers)
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: LLMParamsDoc):
"""Invokes the TGI LLM service to generate FAQ output for the provided input.
Args:
input (LLMParamsDoc): The input text(s).
"""
server_kwargs = {}
if self.access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"}
self.client = HuggingFaceEndpoint(
endpoint_url=self.llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
result = await self.generate(input, self.client)
return result
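The commented-out `/health` call above is replaced by a tiny `/generate` request as a workaround for the TGI Gaudi health-endpoint bug; the equivalent manual probe from the host would look roughly like this, assuming the endpoint variables from the compose files:

```bash
# Manual version of the in-code health probe: a one-token /generate request.
curl http://${host_ip}:${LLM_ENDPOINT_PORT}/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
  -H 'Content-Type: application/json'
```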

View File

@@ -0,0 +1,65 @@
# Copyright (C) 2024 Prediction Guard, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import VLLMOpenAI
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("opea_faqgen_vllm")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEAFAQGen_vLLM")
class OPEAFAQGen_vLLM(OPEAFAQGen):
"""A specialized OPEA FAQGen vLLM component derived from OPEAFAQGen for interacting with vLLM services based on Lanchain VLLMOpenAI API.
Attributes:
client (vLLM): An instance of the vLLM client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the vLLM LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.llm_endpoint}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: LLMParamsDoc):
"""Invokes the vLLM LLM service to generate FAQ output for the provided input.
Args:
input (LLMParamsDoc): The input text(s).
"""
headers = {}
if self.access_token:
headers = {"Authorization": f"Bearer {self.access_token}"}
self.client = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=self.llm_endpoint + "/v1",
model_name=MODEL_NAME,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
)
result = await self.generate(input, self.client)
return result

View File

@@ -0,0 +1,58 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from integrations.tgi import OPEAFAQGen_TGI
from integrations.vllm import OPEAFAQGen_vLLM
from comps import (
CustomLogger,
LLMParamsDoc,
OpeaComponentLoader,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("llm_faqgen")
logflag = os.getenv("LOGFLAG", False)
llm_component_name = os.getenv("FAQGen_COMPONENT_NAME", "OPEAFAQGen_TGI")
# Initialize OpeaComponentLoader
loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM FAQGen Component: {llm_component_name}")
@register_microservice(
name="opea_service@llm_faqgen",
service_type=ServiceType.LLM,
endpoint="/v1/faqgen",
host="0.0.0.0",
port=9000,
)
@register_statistics(names=["opea_service@llm_faqgen"])
async def llm_generate(input: LLMParamsDoc):
start = time.time()
# Log the input if logging is enabled
if logflag:
logger.info(input)
try:
# Use the controller to invoke the active component
response = await loader.invoke(input)
# Record statistics
statistics_dict["opea_service@llm_faqgen"].append_latency(time.time() - start, None)
return response
except Exception as e:
logger.error(f"Error during FaqGen invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA FAQGen Microservice is starting...")
opea_microservices["opea_service@llm_faqgen"].start()
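For local debugging outside the container, a rough equivalent of what `entrypoint.sh` does, assuming a running TGI or vLLM backend, the GenAIComps repo root as the working directory, and the package dependencies already installed:

```bash
cd comps/llms/src/faq-generation
export LLM_ENDPOINT="http://localhost:8008"      # backend TGI or vLLM endpoint
export LLM_MODEL_ID=<your_hf_llm_model>          # placeholder model id
export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI"    # or "OPEAFAQGen_vLLM"
pip install -r requirements-runtime.txt
python opea_faqgen_microservice.py               # serves /v1/faqgen on port 9000
```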