EdgeCraftRAG: Fix multiple issues (#1143)
Signed-off-by: Mingyuan Qi <mingyuan.qi@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
@@ -13,13 +13,11 @@ RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

COPY ./edgecraftrag /home/user/edgecraftrag
COPY ./requirements.txt /home/user/requirements.txt
COPY ./chatqna.py /home/user/chatqna.py

WORKDIR /home/user/edgecraftrag
RUN pip install --no-cache-dir -r requirements.txt

WORKDIR /home/user
RUN pip install --no-cache-dir -r requirements.txt

USER user
@@ -25,6 +25,9 @@ RUN useradd -m -s /bin/bash user && \

COPY ./edgecraftrag /home/user/edgecraftrag

RUN mkdir -p /home/user/gradio_cache
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache

WORKDIR /home/user/edgecraftrag
RUN pip install --no-cache-dir -r requirements.txt
@@ -7,39 +7,112 @@ quality and performance.

## Quick Start Guide

### Run Containers with Docker Compose
### (Optional) Build Docker Images for Mega Service, Server and UI on your own

If you want to build the images on your own, please follow these steps:

```bash
cd GenAIExamples/EdgeCraftRAG

docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag:latest -f Dockerfile .
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag-server:latest -f Dockerfile.server .
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg no_proxy=$no_proxy -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui .
```
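After the builds finish, you can optionally confirm that the three images exist locally (a quick sanity check using standard Docker tooling; the image names follow the `-t` tags used above):

```bash
# List the freshly built EdgeCraftRAG images
docker images | grep edgecraftrag
```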
### Using Intel Arc GPU

#### Local inference with OpenVINO for Intel Arc GPU

You can select the "local" type in the generation field, which is the default approach to enable the Intel Arc GPU for the LLM. You do not need to build any extra images for the "local" type.
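For reference, the default "local" pipeline can also be created from the command line with the configuration shipped in the repository (the same call appears in the Advanced User Guide below; whether this sample config matches your exported models is an assumption you should verify):

```bash
# Create a pipeline that uses the "local" (OpenVINO) inference type
curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.'
```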
#### vLLM with OpenVINO for Intel Arc GPU

You can also select "vLLM" as the generation type. To enable it, you need to build the vLLM image for Intel Arc GPU before bootstrapping the services.
Please follow [vLLM with OpenVINO](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#build-docker-image) to build the vLLM image.
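Once the vLLM image is available, the compose file expects the model id and endpoint to be exported before bootstrap (a sketch; the same variables are listed again in the optional vLLM section below):

```bash
# Variables consumed by the (commented-out) vllm-openvino-server service in compose.yaml
export LLM_MODEL=#your model id
export VLLM_SERVICE_PORT=8008
export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
```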
### Start Edge Craft RAG Services with Docker Compose

If you want to enable vLLM with OpenVINO service, please finish the steps in [Launch vLLM with OpenVINO service](#optional-launch-vllm-with-openvino-service) first.

```bash
cd GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc

export MODEL_PATH="your model path for all your models"
export DOC_PATH="your doc path for uploading a dir of files"
export HOST_IP="your host ip"
export UI_SERVICE_PORT="port for UI service"
export GRADIO_PATH="your gradio cache path for transferring files"

# Optional for vllm endpoint
export vLLM_ENDPOINT="http://${HOST_IP}:8008"
# Make sure all 3 folders have 1000:1000 permission, otherwise
# chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${GRADIO_PATH}

# Use `ip a` to check your active ip
export HOST_IP="your host ip"

# Check group id of video and render
export VIDEOGROUPID=$(getent group video | cut -d: -f3)
export RENDERGROUPID=$(getent group render | cut -d: -f3)

# If you have a proxy configured, uncomment below line
# export no_proxy=$no_proxy,${HOST_IP},edgecraftrag,edgecraftrag-server
# export no_proxy=${no_proxy},${HOST_IP},edgecraftrag,edgecraftrag-server
# export NO_PROXY=${NO_PROXY},${HOST_IP},edgecraftrag,edgecraftrag-server
# If you have a HF mirror configured, it will be imported to the container
# export HF_ENDPOINT="your HF mirror endpoint"

# By default, the ports of the containers are set, uncomment if you want to change
# export MEGA_SERVICE_PORT=16011
# export PIPELINE_SERVICE_PORT=16011
# export UI_SERVICE_PORT="8082"

# Prepare models for embedding, reranking and generation, you can also choose other OpenVINO optimized models
# Here is the example:
pip install --upgrade --upgrade-strategy eager "optimum[openvino]"

optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity
optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-reranker-large --task sentence-similarity
optimum-cli export openvino -m Qwen/Qwen2-7B-Instruct ${MODEL_PATH}/Qwen/Qwen2-7B-Instruct/INT4_compressed_weights --weight-format int4

docker compose up -d

```
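After `docker compose up -d` returns, you can check that the services came up (a sketch using standard Compose tooling):

```bash
# Run from the same directory as compose.yaml
docker compose ps
```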
### (Optional) Build Docker Images for Mega Service, Server and UI on your own
#### (Optional) Launch vLLM with OpenVINO service

1. Set up Environment Variables

```bash
cd GenAIExamples/EdgeCraftRAG
export LLM_MODEL=#your model id
export VLLM_SERVICE_PORT=8008
export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT}"
export HUGGINGFACEHUB_API_TOKEN=#your HF token
```

docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag:latest -f Dockerfile .
docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-server:latest -f Dockerfile.server .
docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui .
2. Uncomment the code below in 'GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml'

```bash
# vllm-openvino-server:
# container_name: vllm-openvino-server
# image: opea/vllm-arc:latest
# ports:
# - ${VLLM_SERVICE_PORT:-8008}:80
# environment:
# HTTPS_PROXY: ${https_proxy}
# HTTP_PROXY: ${https_proxy}
# VLLM_OPENVINO_DEVICE: GPU
# HF_ENDPOINT: ${HF_ENDPOINT}
# HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
# volumes:
# - /dev/dri/by-path:/dev/dri/by-path
# - $HOME/.cache/huggingface:/root/.cache/huggingface
# devices:
# - /dev/dri
# entrypoint: /bin/bash -c "\
# cd / && \
# export VLLM_CPU_KVCACHE_SPACE=50 && \
# export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON && \
# python3 -m vllm.entrypoints.openai.api_server \
# --model '${LLM_MODEL}' \
# --max_model_len=1024 \
# --host 0.0.0.0 \
# --port 80"
```
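After uncommenting the service definition, it is brought up together with the rest of the stack using the same command as in the Docker Compose section above:

```bash
cd GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc
docker compose up -d
```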
### ChatQnA with LLM Example (Command Line)

@@ -109,7 +182,7 @@ curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: app
# }

# Prepare data from local directory
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"#REPLACE WITH YOUR LOCAL DOC DIR#"}' | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR DIR WITHIN MOUNTED DOC PATH#"}' | jq '.'

# Validate Mega Service
curl -X POST http://${HOST_IP}:16011/v1/chatqna -H "Content-Type: application/json" -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "top_n":5, "max_tokens":512}' | jq '.'
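This commit also adds streaming support to the gateway; a hedged sketch of requesting a streamed answer over server-sent events (the `stream` flag is read by the updated `handle_request` and `chatqna` handlers later in this diff):

```bash
# -N disables curl's output buffering so tokens are printed as they arrive
curl -N -X POST http://${HOST_IP}:16011/v1/chatqna -H "Content-Type: application/json" -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "stream": true, "max_tokens":512}'
```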
@@ -121,33 +194,14 @@ Open your browser, access http://${HOST_IP}:8082

> Your browser should be running on the same host as your console; otherwise you will need to access the UI with your host domain name instead of ${HOST_IP}.

### (Optional) Launch vLLM with OpenVINO service
To create a default pipeline, you need to click the `Create Pipeline` button on the `RAG Settings` page. You can also create multiple pipelines or update existing pipelines through the `Pipeline Configuration`, but please note that active pipelines cannot be updated.

```bash
# 1. export LLM_MODEL
export LLM_MODEL="your model id"
# 2. Uncomment below code in 'GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml'
# vllm-service:
# image: vllm:openvino
# container_name: vllm-openvino-server
# depends_on:
# - vllm-service
# ports:
# - "8008:80"
# environment:
# no_proxy: ${no_proxy}
# http_proxy: ${http_proxy}
# https_proxy: ${https_proxy}
# vLLM_ENDPOINT: ${vLLM_ENDPOINT}
# LLM_MODEL: ${LLM_MODEL}
# entrypoint: /bin/bash -c "\
# cd / && \
# export VLLM_CPU_KVCACHE_SPACE=50 && \
# python3 -m vllm.entrypoints.openai.api_server \
# --model '${LLM_MODEL}' \
# --host 0.0.0.0 \
# --port 80"
```
After the pipeline creation, you can upload your data in the `Chatbot` page.


Then, you can submit messages in the chat box.

## Advanced User Guide

@@ -156,27 +210,13 @@ export LLM_MODEL="your model id"
#### Create a pipeline

```bash
curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline.json | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.'
```

It will take some time to prepare the embedding model.
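If you want to check on progress, the pipeline listing endpoint from the section below can be polled (a sketch; the exact readiness semantics are an assumption to verify against your deployment):

```bash
# List pipelines and inspect the status of the newly created one
curl -X GET http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" | jq '.'
```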
#### Upload a text
#### Update a pipeline

```bash
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.'
```

#### Provide a query to retrieve context with similarity search.

```bash
curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d @examples/test_query.json | jq '.'
```

#### Create the second pipeline test2

```bash
curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline2.json | jq '.'
curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.'
```

#### Check all pipelines
@@ -185,19 +225,10 @@ curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: app
curl -X GET http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" | jq '.'
```

#### Compare similarity retrieval (test1) and keyword retrieval (test2)
#### Activate a pipeline

```bash
# Activate pipeline test1
curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test1 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.'
# Similarity retrieval
curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.'

# Activate pipeline test2
curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test2 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.'
# Keyword retrieval
curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.'

```

### Model Management
@@ -205,7 +236,7 @@ curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/
#### Load a model

```bash
curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d @examples/test_model_load.json | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d '{"model_type": "reranker", "model_id": "BAAI/bge-reranker-large", "model_path": "./models/bge_ov_reranker", "device": "cpu"}' | jq '.'
```

It will take some time to load the model.
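To see whether loading has finished, the model listing endpoint from the next section can be polled (a sketch):

```bash
# The reranker should appear in this list once it has been loaded
curl -X GET http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" | jq '.'
```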
@@ -219,7 +250,7 @@ curl -X GET http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: applica
#### Update a model

```bash
curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d @examples/test_model_update.json | jq '.'
curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d '{"model_type": "reranker", "model_id": "BAAI/bge-reranker-large", "model_path": "./models/bge_ov_reranker", "device": "gpu"}' | jq '.'
```

#### Check a certain model
@@ -239,14 +270,14 @@ curl -X DELETE http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-larg
#### Add a text

```bash
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"text":"#REPLACE WITH YOUR TEXT"}' | jq '.'
```

#### Add files from an existing file path

```bash
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_dir.json | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR DIR WITHIN MOUNTED DOC PATH#"}' | jq '.'
curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR FILE WITHIN MOUNTED DOC PATH#"}' | jq '.'
```

#### Check all files
@@ -270,5 +301,5 @@ curl -X DELETE http://${HOST_IP}:16010/v1/data/files/test2.docx -H "Content-Type
#### Update a file

```bash
curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.'
curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d '{"local_path":"docs/#REPLACE WITH YOUR FILE WITHIN MOUNTED DOC PATH#"}' | jq '.'
```
BIN  EdgeCraftRAG/assets/img/chat_with_rag.png (new file, 92 KiB, binary file not shown)
BIN  EdgeCraftRAG/assets/img/create_pipeline.png (new file, 168 KiB, binary file not shown)
BIN  EdgeCraftRAG/assets/img/upload_data.png (new file, 86 KiB, binary file not shown)
@@ -18,6 +18,7 @@ from comps.cores.proto.api_protocol import (
|
||||
ChatMessage,
|
||||
UsageInfo,
|
||||
)
|
||||
from comps.cores.proto.docarray import LLMParams
|
||||
from fastapi import Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
@@ -30,7 +31,20 @@ class EdgeCraftRagGateway(Gateway):
|
||||
|
||||
async def handle_request(self, request: Request):
|
||||
input = await request.json()
|
||||
result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input)
|
||||
stream_opt = input.get("stream", False)
|
||||
chat_request = ChatCompletionRequest.parse_obj(input)
|
||||
parameters = LLMParams(
|
||||
max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024,
|
||||
top_k=chat_request.top_k if chat_request.top_k else 10,
|
||||
top_p=chat_request.top_p if chat_request.top_p else 0.95,
|
||||
temperature=chat_request.temperature if chat_request.temperature else 0.01,
|
||||
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0,
|
||||
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0,
|
||||
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03,
|
||||
streaming=stream_opt,
|
||||
chat_template=chat_request.chat_template if chat_request.chat_template else None,
|
||||
)
|
||||
result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input, llm_parameters=parameters)
|
||||
for node, response in result_dict.items():
|
||||
if isinstance(response, StreamingResponse):
|
||||
return response
|
||||
@@ -61,7 +75,7 @@ class EdgeCraftRagService:
|
||||
port=PIPELINE_SERVICE_PORT,
|
||||
endpoint="/v1/chatqna",
|
||||
use_remote_service=True,
|
||||
service_type=ServiceType.UNDEFINED,
|
||||
service_type=ServiceType.LLM,
|
||||
)
|
||||
self.megaservice.add(edgecraftrag)
|
||||
self.gateway = EdgeCraftRagGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port)
|
||||
|
||||
@@ -14,12 +14,15 @@ services:
|
||||
volumes:
|
||||
- ${MODEL_PATH:-${PWD}}:/home/user/models
|
||||
- ${DOC_PATH:-${PWD}}:/home/user/docs
|
||||
- ${GRADIO_PATH:-${PWD}}:/home/user/gradio_cache
|
||||
- ${HF_CACHE:-${HOME}/.cache}:/home/user/.cache
|
||||
ports:
|
||||
- ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010}
|
||||
devices:
|
||||
- /dev/dri:/dev/dri
|
||||
group_add:
|
||||
- video
|
||||
- ${VIDEOGROUPID:-44}
|
||||
- ${RENDERGROUPID:-109}
|
||||
ecrag:
|
||||
image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest}
|
||||
container_name: edgecraftrag
|
||||
@@ -48,31 +51,42 @@ services:
|
||||
PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}}
|
||||
UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082}
|
||||
UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0}
|
||||
volumes:
|
||||
- ${GRADIO_PATH:-${PWD}}:/home/user/gradio_cache
|
||||
ports:
|
||||
- ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
|
||||
- ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082}
|
||||
restart: always
|
||||
depends_on:
|
||||
- server
|
||||
- ecrag
|
||||
# vllm-service:
|
||||
# image: vllm:openvino
|
||||
# vllm-openvino-server:
|
||||
# container_name: vllm-openvino-server
|
||||
# image: opea/vllm-arc:latest
|
||||
# ports:
|
||||
# - "8008:80"
|
||||
# - ${VLLM_SERVICE_PORT:-8008}:80
|
||||
# environment:
|
||||
# no_proxy: ${no_proxy}
|
||||
# http_proxy: ${http_proxy}
|
||||
# https_proxy: ${https_proxy}
|
||||
# vLLM_ENDPOINT: ${vLLM_ENDPOINT}
|
||||
# LLM_MODEL: ${LLM_MODEL}
|
||||
# HTTPS_PROXY: ${https_proxy}
|
||||
# HTTP_PROXY: ${https_proxy}
|
||||
# VLLM_OPENVINO_DEVICE: GPU
|
||||
# HF_ENDPOINT: ${HF_ENDPOINT}
|
||||
# HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
|
||||
# volumes:
|
||||
# - /dev/dri/by-path:/dev/dri/by-path
|
||||
# - $HOME/.cache/huggingface:/root/.cache/huggingface
|
||||
# devices:
|
||||
# - /dev/dri
|
||||
# group_add:
|
||||
# - ${VIDEOGROUPID:-44}
|
||||
# - ${RENDERGROUPID:-109}
|
||||
# entrypoint: /bin/bash -c "\
|
||||
# cd / && \
|
||||
# export VLLM_CPU_KVCACHE_SPACE=50 && \
|
||||
# export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON && \
|
||||
# python3 -m vllm.entrypoints.openai.api_server \
|
||||
# --model '${LLM_MODEL}' \
|
||||
# --max_model_len=1024 \
|
||||
# --host 0.0.0.0 \
|
||||
# --port 80"
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
|
||||
@@ -25,5 +25,8 @@ async def retrieval(request: ChatCompletionRequest):
|
||||
# ChatQnA
|
||||
@chatqna_app.post(path="/v1/chatqna")
|
||||
async def chatqna(request: ChatCompletionRequest):
|
||||
ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request)
|
||||
return str(ret)
|
||||
if request.stream:
|
||||
return ctx.get_pipeline_mgr().run_pipeline(chat_request=request)
|
||||
else:
|
||||
ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request)
|
||||
return str(ret)
|
||||
|
||||
@@ -157,16 +157,13 @@ def update_pipeline_handler(pl, req):
|
||||
gen = req.generator
|
||||
if gen.model is None:
|
||||
return "No ChatQnA Model"
|
||||
if gen.inference_type == InferenceType.VLLM:
|
||||
if gen.model.model_id:
|
||||
model_ref = gen.model.model_id
|
||||
else:
|
||||
model_ref = gen.model.model_path
|
||||
pl.generator = QnAGenerator(model_ref, gen.prompt_path, gen.inference_type)
|
||||
elif gen.inference_type == InferenceType.LOCAL:
|
||||
if gen.inference_type:
|
||||
model = ctx.get_model_mgr().search_model(gen.model)
|
||||
if model is None:
|
||||
gen.model.model_type = ModelType.LLM
|
||||
if gen.inference_type == InferenceType.VLLM:
|
||||
gen.model.model_type = ModelType.VLLM
|
||||
else:
|
||||
gen.model.model_type = ModelType.LLM
|
||||
model = ctx.get_model_mgr().load_model(gen.model)
|
||||
ctx.get_model_mgr().add(model)
|
||||
# Use weakref to achieve model deletion and memory release
|
||||
|
||||
@@ -10,6 +10,7 @@ class ModelIn(BaseModel):
|
||||
model_type: Optional[str] = "LLM"
|
||||
model_id: Optional[str]
|
||||
model_path: Optional[str] = "./"
|
||||
weight: Optional[str]
|
||||
device: Optional[str] = "cpu"
|
||||
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ class ModelType(str, Enum):
|
||||
EMBEDDING = "embedding"
|
||||
RERANKER = "reranker"
|
||||
LLM = "llm"
|
||||
VLLM = "vllm"
|
||||
|
||||
|
||||
class FileType(str, Enum):
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import os
|
||||
|
||||
from comps import GeneratedDoc, opea_telemetry
|
||||
from comps import GeneratedDoc
|
||||
from edgecraftrag.base import BaseComponent, CompType, GeneratorType
|
||||
from fastapi.responses import StreamingResponse
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
@@ -12,18 +13,6 @@ from llama_index.llms.openai_like import OpenAILike
|
||||
from pydantic import model_serializer
|
||||
|
||||
|
||||
@opea_telemetry
|
||||
def post_process_text(text: str):
|
||||
if text == " ":
|
||||
return "data: @#$\n\n"
|
||||
if text == "\n":
|
||||
return "data: <br/>\n\n"
|
||||
if text.isspace():
|
||||
return None
|
||||
new_text = text.replace(" ", "@#$")
|
||||
return f"data: {new_text}\n\n"
|
||||
|
||||
|
||||
class QnAGenerator(BaseComponent):
|
||||
|
||||
def __init__(self, llm_model, prompt_template, inference_type, **kwargs):
|
||||
@@ -76,8 +65,18 @@ class QnAGenerator(BaseComponent):
|
||||
repetition_penalty=chat_request.repetition_penalty,
|
||||
)
|
||||
self.llm().generate_kwargs = generate_kwargs
|
||||
if chat_request.stream:
|
||||
|
||||
return self.llm().complete(prompt_str)
|
||||
async def stream_generator():
|
||||
response = self.llm().stream_complete(prompt_str)
|
||||
for r in response:
|
||||
yield r.delta
|
||||
# Simulate asynchronous operation
|
||||
await asyncio.sleep(0.01)
|
||||
|
||||
return StreamingResponse(stream_generator(), media_type="text/event-stream")
|
||||
else:
|
||||
return self.llm().complete(prompt_str)
|
||||
|
||||
def run_vllm(self, chat_request, retrieved_nodes, **kwargs):
|
||||
if self.llm is None:
|
||||
@@ -92,7 +91,7 @@ class QnAGenerator(BaseComponent):
|
||||
prompt_str = self.prompt.format(input=query, context=text_gen_context)
|
||||
|
||||
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8008")
|
||||
model_name = self.llm
|
||||
model_name = self.llm().model_id
|
||||
llm = OpenAILike(
|
||||
api_key="fake",
|
||||
api_base=llm_endpoint + "/v1",
|
||||
@@ -106,12 +105,10 @@ class QnAGenerator(BaseComponent):
|
||||
if chat_request.stream:
|
||||
|
||||
async def stream_generator():
|
||||
response = await llm.astream_complete(prompt_str)
|
||||
async for text in response:
|
||||
output = text.text
|
||||
yield f"data: {output}\n\n"
|
||||
|
||||
yield "data: [DONE]\n\n"
|
||||
response = llm.stream_complete(prompt_str)
|
||||
for text in response:
|
||||
yield text.delta
|
||||
await asyncio.sleep(0.01)
|
||||
|
||||
return StreamingResponse(stream_generator(), media_type="text/event-stream")
|
||||
else:
|
||||
@@ -122,7 +119,12 @@ class QnAGenerator(BaseComponent):
|
||||
|
||||
@model_serializer
|
||||
def ser_model(self):
|
||||
set = {"idx": self.idx, "generator_type": self.comp_subtype, "model": self.model_id}
|
||||
set = {
|
||||
"idx": self.idx,
|
||||
"generator_type": self.comp_subtype,
|
||||
"inference_type": self.inference_type,
|
||||
"model": self.llm(),
|
||||
}
|
||||
return set
|
||||
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ class BaseModelComponent(BaseComponent):
|
||||
|
||||
model_id: Optional[str] = Field(default="")
|
||||
model_path: Optional[str] = Field(default="")
|
||||
weight: Optional[str] = Field(default="")
|
||||
device: Optional[str] = Field(default="cpu")
|
||||
|
||||
def run(self, **kwargs) -> Any:
|
||||
@@ -26,6 +27,7 @@ class BaseModelComponent(BaseComponent):
|
||||
"type": self.comp_subtype,
|
||||
"model_id": self.model_id,
|
||||
"model_path": self.model_path,
|
||||
"weight": self.weight,
|
||||
"device": self.device,
|
||||
}
|
||||
return set
|
||||
@@ -33,7 +35,7 @@ class BaseModelComponent(BaseComponent):
|
||||
|
||||
class OpenVINOEmbeddingModel(BaseModelComponent, OpenVINOEmbedding):
|
||||
|
||||
def __init__(self, model_id, model_path, device):
|
||||
def __init__(self, model_id, model_path, device, weight):
|
||||
OpenVINOEmbedding.create_and_save_openvino_model(model_id, model_path)
|
||||
OpenVINOEmbedding.__init__(self, model_id_or_path=model_path, device=device)
|
||||
self.comp_type = CompType.MODEL
|
||||
@@ -41,11 +43,12 @@ class OpenVINOEmbeddingModel(BaseModelComponent, OpenVINOEmbedding):
|
||||
self.model_id = model_id
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
self.weight = ""
|
||||
|
||||
|
||||
class OpenVINORerankModel(BaseModelComponent, OpenVINORerank):
|
||||
|
||||
def __init__(self, model_id, model_path, device):
|
||||
def __init__(self, model_id, model_path, device, weight):
|
||||
OpenVINORerank.create_and_save_openvino_model(model_id, model_path)
|
||||
OpenVINORerank.__init__(
|
||||
self,
|
||||
@@ -57,11 +60,12 @@ class OpenVINORerankModel(BaseModelComponent, OpenVINORerank):
|
||||
self.model_id = model_id
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
self.weight = ""
|
||||
|
||||
|
||||
class OpenVINOLLMModel(BaseModelComponent, OpenVINOLLM):
|
||||
|
||||
def __init__(self, model_id, model_path, device):
|
||||
def __init__(self, model_id, model_path, device, weight):
|
||||
OpenVINOLLM.__init__(
|
||||
self,
|
||||
model_id_or_path=model_path,
|
||||
@@ -72,3 +76,4 @@ class OpenVINOLLMModel(BaseModelComponent, OpenVINOLLM):
|
||||
self.model_id = model_id
|
||||
self.model_path = model_path
|
||||
self.device = device
|
||||
self.weight = weight
|
||||
|
||||
@@ -110,8 +110,10 @@ class Pipeline(BaseComponent):
|
||||
return True
|
||||
if self.generator:
|
||||
llm = self.generator.llm
|
||||
if llm() and llm().model_id == model_id:
|
||||
return True
|
||||
if isinstance(llm, str):
|
||||
return llm == model_id
|
||||
else:
|
||||
return llm().model_id == model_id
|
||||
return False
|
||||
|
||||
|
||||
@@ -154,7 +156,8 @@ def run_test_generator(pl: Pipeline, chat_request: ChatCompletionRequest) -> Any
|
||||
if pl.generator is None:
|
||||
return "No Generator Specified"
|
||||
if pl.generator.inference_type == InferenceType.LOCAL:
|
||||
answer = pl.generator.run(chat_request, retri_res)
|
||||
return pl.generator.run(chat_request, retri_res)
|
||||
elif pl.generator.inference_type == InferenceType.VLLM:
|
||||
answer = pl.generator.run_vllm(chat_request, retri_res)
|
||||
return answer
|
||||
return pl.generator.run_vllm(chat_request, retri_res)
|
||||
else:
|
||||
return "LLM inference_type not supported"
|
||||
|
||||
@@ -3,9 +3,14 @@
|
||||
|
||||
import asyncio
|
||||
|
||||
from edgecraftrag.api_schema import IndexerIn, ModelIn, NodeParserIn
|
||||
from edgecraftrag.base import BaseComponent, BaseMgr, CallbackType, ModelType
|
||||
from edgecraftrag.components.model import OpenVINOEmbeddingModel, OpenVINOLLMModel, OpenVINORerankModel
|
||||
from edgecraftrag.api_schema import ModelIn
|
||||
from edgecraftrag.base import BaseComponent, BaseMgr, CompType, ModelType
|
||||
from edgecraftrag.components.model import (
|
||||
BaseModelComponent,
|
||||
OpenVINOEmbeddingModel,
|
||||
OpenVINOLLMModel,
|
||||
OpenVINORerankModel,
|
||||
)
|
||||
|
||||
|
||||
class ModelMgr(BaseMgr):
|
||||
@@ -78,17 +83,25 @@ class ModelMgr(BaseMgr):
|
||||
model_id=model_para.model_id,
|
||||
model_path=model_para.model_path,
|
||||
device=model_para.device,
|
||||
weight=model_para.weight,
|
||||
)
|
||||
case ModelType.RERANKER:
|
||||
model = OpenVINORerankModel(
|
||||
model_id=model_para.model_id,
|
||||
model_path=model_para.model_path,
|
||||
device=model_para.device,
|
||||
weight=model_para.weight,
|
||||
)
|
||||
case ModelType.LLM:
|
||||
model = OpenVINOLLMModel(
|
||||
model_id=model_para.model_id,
|
||||
model_path=model_para.model_path,
|
||||
device=model_para.device,
|
||||
weight=model_para.weight,
|
||||
)
|
||||
case ModelType.VLLM:
|
||||
model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="")
|
||||
model.comp_type = CompType.MODEL
|
||||
model.comp_subtype = ModelType.VLLM
|
||||
model.model_id_or_path = model_para.model_id
|
||||
return model
|
||||
|
||||
@@ -5,4 +5,4 @@
|
||||
<|im_start|>System: Pay attention to your formatting of response. If you need to reference content from context, try to keep the formatting.<|im_end|>
|
||||
<|im_start|>System: Try to summarize from the context, do some reasoning before response, then response. Make sure your response is logically sound and self-consistent.<|im_end|>
|
||||
|
||||
<|im_start|>{input}
|
||||
<|im_start|>{input}
|
||||
@@ -1,6 +1,5 @@
|
||||
docx2txt
|
||||
faiss-cpu>=1.8.0.post1
|
||||
gradio>=4.44.1
|
||||
langchain-core==0.2.29
|
||||
llama-index>=0.11.0
|
||||
llama-index-embeddings-openvino>=0.4.0
|
||||
@@ -9,8 +8,4 @@ llama-index-llms-openvino>=0.3.1
|
||||
llama-index-postprocessor-openvino-rerank>=0.3.0
|
||||
llama-index-retrievers-bm25>=0.3.0
|
||||
llama-index-vector-stores-faiss>=0.2.1
|
||||
loguru>=0.7.2
|
||||
omegaconf>=2.3.0
|
||||
opea-comps>=0.9
|
||||
py-cpuinfo>=9.0.0
|
||||
uvicorn>=0.30.6
|
||||
|
||||
2  EdgeCraftRAG/requirements.txt (new file)
@@ -0,0 +1,2 @@
|
||||
fastapi>=0.115.0
|
||||
opea-comps>=0.9
|
||||
@@ -9,7 +9,6 @@
|
||||
"indexer_type": "faiss_vector",
|
||||
"embedding_model": {
|
||||
"model_id": "BAAI/bge-small-en-v1.5",
|
||||
"model_path": "./models/bge_ov_embedding",
|
||||
"device": "auto"
|
||||
}
|
||||
},
|
||||
@@ -23,7 +22,6 @@
|
||||
"top_n": 2,
|
||||
"reranker_model": {
|
||||
"model_id": "BAAI/bge-reranker-large",
|
||||
"model_path": "./models/bge_ov_reranker",
|
||||
"device": "auto"
|
||||
}
|
||||
}
|
||||
@@ -31,7 +29,6 @@
|
||||
"generator": {
|
||||
"model": {
|
||||
"model_id": "Qwen/Qwen2-7B-Instruct",
|
||||
"model_path": "./models/qwen2-7b-instruct/INT4_compressed_weights",
|
||||
"device": "cpu"
|
||||
},
|
||||
"prompt_path": "./edgecraftrag/prompt_template/default_prompt.txt",
|
||||
|
||||
@@ -11,10 +11,11 @@ RUN useradd -m -s /bin/bash user && \
|
||||
COPY ./ui/gradio /home/user/ui
|
||||
COPY ./edgecraftrag /home/user/edgecraftrag
|
||||
|
||||
WORKDIR /home/user/edgecraftrag
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN mkdir -p /home/user/gradio_cache
|
||||
ENV GRADIO_TEMP_DIR=/home/user/gradio_cache
|
||||
|
||||
WORKDIR /home/user/ui
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
USER user
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
# Model language for LLM
|
||||
model_language: "Chinese"
|
||||
vector_db: "FAISS"
|
||||
splitter_name: "RecursiveCharacter"
|
||||
k_rerank: 5
|
||||
search_method: "similarity"
|
||||
@@ -29,21 +28,19 @@ k_retrieval: 30
|
||||
postprocessor: "reranker"
|
||||
|
||||
# Generator
|
||||
generator: "local"
|
||||
prompt_path: "./data/default_prompt.txt"
|
||||
generator: "chatqna"
|
||||
prompt_path: "./edgecraftrag/prompt_template/default_prompt.txt"
|
||||
|
||||
# Models
|
||||
embedding_model_id: "BAAI/bge-small-en-v1.5"
|
||||
embedding_model_path: "./bge_ov_embedding"
|
||||
# Device for embedding model inference
|
||||
embedding_device: "AUTO"
|
||||
|
||||
rerank_model_id: "BAAI/bge-reranker-large"
|
||||
rerank_model_path: "./bge_ov_reranker"
|
||||
# Device for reranking model inference
|
||||
rerank_device: "AUTO"
|
||||
|
||||
llm_model_id: "qwen2-7b-instruct"
|
||||
llm_model_path: "./qwen2-7b-instruct/INT4_compressed_weights"
|
||||
llm_model_id: "Qwen/Qwen2-7B-Instruct"
|
||||
llm_weights: "INT4"
|
||||
# Device for LLM model inference
|
||||
llm_device: "AUTO"
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import platform_config as pconf
|
||||
import requests
|
||||
|
||||
sys.path.append("..")
|
||||
import os
|
||||
|
||||
from edgecraftrag import api_schema
|
||||
|
||||
PIPELINE_SERVICE_HOST_IP = os.getenv("PIPELINE_SERVICE_HOST_IP", "127.0.0.1")
|
||||
@@ -42,6 +42,7 @@ def create_update_pipeline(
|
||||
vector_search_top_k,
|
||||
postprocessor,
|
||||
generator,
|
||||
llm_infertype,
|
||||
llm_id,
|
||||
llm_device,
|
||||
llm_weights,
|
||||
@@ -50,6 +51,7 @@ def create_update_pipeline(
|
||||
rerank_id,
|
||||
rerank_device,
|
||||
):
|
||||
llm_path = pconf.get_llm_model_dir("./models/", llm_id, llm_weights)
|
||||
req_dict = api_schema.PipelineCreateIn(
|
||||
name=name,
|
||||
active=active,
|
||||
@@ -60,9 +62,9 @@ def create_update_pipeline(
|
||||
indexer_type=indexer,
|
||||
embedding_model=api_schema.ModelIn(
|
||||
model_id=embedding_id,
|
||||
# TODO: remove hardcoding
|
||||
model_path="./bge_ov_embedding",
|
||||
model_path="./models/" + embedding_id,
|
||||
device=embedding_device,
|
||||
weight=llm_weights,
|
||||
),
|
||||
),
|
||||
retriever=api_schema.RetrieverIn(retriever_type=retriever, retriever_topk=vector_search_top_k),
|
||||
@@ -70,22 +72,15 @@ def create_update_pipeline(
|
||||
api_schema.PostProcessorIn(
|
||||
processor_type=postprocessor[0],
|
||||
reranker_model=api_schema.ModelIn(
|
||||
model_id=rerank_id,
|
||||
# TODO: remove hardcoding
|
||||
model_path="./bge_ov_reranker",
|
||||
device=rerank_device,
|
||||
model_id=rerank_id, model_path="./models/" + rerank_id, device=rerank_device, weight=llm_weights
|
||||
),
|
||||
)
|
||||
],
|
||||
generator=api_schema.GeneratorIn(
|
||||
# TODO: remove hardcoding
|
||||
prompt_path="./edgecraftrag/prompt_template/default_prompt.txt",
|
||||
model=api_schema.ModelIn(
|
||||
model_id=llm_id,
|
||||
# TODO: remove hardcoding
|
||||
model_path="./models/qwen2-7b-instruct/INT4_compressed_weights",
|
||||
device=llm_device,
|
||||
),
|
||||
model=api_schema.ModelIn(model_id=llm_id, model_path=llm_path, device=llm_device, weight=llm_weights),
|
||||
inference_type=llm_infertype,
|
||||
),
|
||||
)
|
||||
# hard code only for test
|
||||
@@ -105,7 +100,7 @@ def activate_pipeline(name):
|
||||
return restext, status
|
||||
|
||||
|
||||
def create_vectordb(docs, spliter, vector_db):
|
||||
def create_vectordb(docs, spliter):
|
||||
req_dict = api_schema.FilesIn(local_paths=docs)
|
||||
res = requests.post(f"{server_addr}/v1/data/files", json=req_dict.dict(), proxies={"http": None})
|
||||
return res.text
|
||||
@@ -116,6 +111,8 @@ def get_files():
|
||||
files = []
|
||||
for file in res.json():
|
||||
files.append((file["file_name"], file["file_id"]))
|
||||
if not files:
|
||||
files.append((None, None))
|
||||
return files
|
||||
|
||||
|
||||
|
||||
@@ -2,11 +2,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import cpuinfo
|
||||
import distro # if running Python 3.8 or above
|
||||
@@ -17,41 +15,22 @@ import httpx
|
||||
# Creation of the ModelLoader instance and loading models remain the same
|
||||
import platform_config as pconf
|
||||
import psutil
|
||||
import requests
|
||||
from loguru import logger
|
||||
from omegaconf import OmegaConf
|
||||
from platform_config import get_available_devices, get_available_weights, get_local_available_models
|
||||
from platform_config import (
|
||||
get_avail_llm_inference_type,
|
||||
get_available_devices,
|
||||
get_available_weights,
|
||||
get_local_available_models,
|
||||
)
|
||||
|
||||
pipeline_df = []
|
||||
|
||||
import os
|
||||
|
||||
MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "127.0.0.1")
|
||||
MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 16011))
|
||||
UI_SERVICE_HOST_IP = os.getenv("UI_SERVICE_HOST_IP", "0.0.0.0")
|
||||
UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8084))
|
||||
|
||||
|
||||
def get_llm_model_dir(llm_model_id, weights_compression):
|
||||
model_dirs = {
|
||||
"fp16_model_dir": Path(llm_model_id) / "FP16",
|
||||
"int8_model_dir": Path(llm_model_id) / "INT8_compressed_weights",
|
||||
"int4_model_dir": Path(llm_model_id) / "INT4_compressed_weights",
|
||||
}
|
||||
|
||||
if weights_compression == "INT4":
|
||||
model_dir = model_dirs["int4_model_dir"]
|
||||
elif weights_compression == "INT8":
|
||||
model_dir = model_dirs["int8_model_dir"]
|
||||
else:
|
||||
model_dir = model_dirs["fp16_model_dir"]
|
||||
|
||||
if not model_dir.exists():
|
||||
raise FileNotFoundError(f"The model directory {model_dir} does not exist.")
|
||||
elif not model_dir.is_dir():
|
||||
raise NotADirectoryError(f"The path {model_dir} is not a directory.")
|
||||
|
||||
return model_dir
|
||||
UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8082))
|
||||
|
||||
|
||||
def get_system_status():
|
||||
@@ -87,31 +66,7 @@ def get_system_status():
|
||||
return status
|
||||
|
||||
|
||||
def build_demo(cfg, args):
|
||||
|
||||
def load_chatbot_models(
|
||||
llm_id,
|
||||
llm_device,
|
||||
llm_weights,
|
||||
embedding_id,
|
||||
embedding_device,
|
||||
rerank_id,
|
||||
rerank_device,
|
||||
):
|
||||
req_dict = {
|
||||
"llm_id": llm_id,
|
||||
"llm_device": llm_device,
|
||||
"llm_weights": llm_weights,
|
||||
"embedding_id": embedding_id,
|
||||
"embedding_device": embedding_device,
|
||||
"rerank_id": rerank_id,
|
||||
"rerank_device": rerank_device,
|
||||
}
|
||||
# hard code only for test
|
||||
worker_addr = "http://127.0.0.1:8084"
|
||||
print(req_dict)
|
||||
result = requests.post(f"{worker_addr}/load", json=req_dict, proxies={"http": None})
|
||||
return result.text
|
||||
def build_app(cfg, args):
|
||||
|
||||
def user(message, history):
|
||||
"""Callback function for updating user messages in interface on submit button click.
|
||||
@@ -131,11 +86,9 @@ def build_demo(cfg, args):
|
||||
top_p,
|
||||
top_k,
|
||||
repetition_penalty,
|
||||
max_tokens,
|
||||
hide_full_prompt,
|
||||
do_rag,
|
||||
docs,
|
||||
spliter_name,
|
||||
vector_db,
|
||||
chunk_size,
|
||||
chunk_overlap,
|
||||
vector_search_top_k,
|
||||
@@ -155,41 +108,16 @@ def build_demo(cfg, args):
|
||||
repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.
|
||||
conversation_id: unique conversation identifier.
|
||||
"""
|
||||
# req_dict = {
|
||||
# "history": history,
|
||||
# "temperature": temperature,
|
||||
# "top_p": top_p,
|
||||
# "top_k": top_k,
|
||||
# "repetition_penalty": repetition_penalty,
|
||||
# "hide_full_prompt": hide_full_prompt,
|
||||
# "do_rag": do_rag,
|
||||
# "docs": docs,
|
||||
# "spliter_name": spliter_name,
|
||||
# "vector_db": vector_db,
|
||||
# "chunk_size": chunk_size,
|
||||
# "chunk_overlap": chunk_overlap,
|
||||
# "vector_search_top_k": vector_search_top_k,
|
||||
# "vector_search_top_n": vector_search_top_n,
|
||||
# "run_rerank": run_rerank,
|
||||
# "search_method": search_method,
|
||||
# "score_threshold": score_threshold,
|
||||
# "streaming": True
|
||||
# }
|
||||
print(history)
|
||||
new_req = {"messages": history[-1][0]}
|
||||
stream_opt = True
|
||||
new_req = {"messages": history[-1][0], "stream": stream_opt, "max_tokens": max_tokens}
|
||||
server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}"
|
||||
|
||||
# Async for streaming response
|
||||
partial_text = ""
|
||||
async with httpx.AsyncClient() as client:
|
||||
async with client.stream("POST", f"{server_addr}/v1/chatqna", json=new_req, timeout=None) as response:
|
||||
partial_text = ""
|
||||
async for chunk in response.aiter_lines():
|
||||
new_text = chunk
|
||||
if new_text.startswith("data"):
|
||||
new_text = re.sub(r"\r\n", "", chunk.split("data: ")[-1])
|
||||
new_text = json.loads(chunk)["choices"][0]["message"]["content"]
|
||||
partial_text = partial_text + new_text
|
||||
async for chunk in response.aiter_text():
|
||||
partial_text = partial_text + chunk
|
||||
history[-1][1] = partial_text
|
||||
yield history
|
||||
|
||||
@@ -198,6 +126,7 @@ def build_demo(cfg, args):
|
||||
avail_rerank_models = get_local_available_models("rerank")
|
||||
avail_devices = get_available_devices()
|
||||
avail_weights_compression = get_available_weights()
|
||||
avail_llm_inference_type = get_avail_llm_inference_type()
|
||||
avail_node_parsers = pconf.get_available_node_parsers()
|
||||
avail_indexers = pconf.get_available_indexers()
|
||||
avail_retrievers = pconf.get_available_retrievers()
|
||||
@@ -212,7 +141,7 @@ def build_demo(cfg, args):
|
||||
.disclaimer {font-variant-caps: all-small-caps}
|
||||
"""
|
||||
|
||||
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
||||
with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
|
||||
gr.HTML(
|
||||
"""
|
||||
<!DOCTYPE html>
|
||||
@@ -250,7 +179,7 @@ def build_demo(cfg, args):
|
||||
<!-- Title container centered in the remaining space -->
|
||||
<div class="title-container">
|
||||
<span class="title-line"><h1 >Edge Craft RAG based Q&A Chatbot</h1></span>
|
||||
<span class="title-line"><h5 style="margin: 0;">Powered by Intel NEXC Edge AI solutions</h5></span>
|
||||
<span class="title-line"><h5 style="margin: 0;">Powered by Intel</h5></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -295,7 +224,6 @@ def build_demo(cfg, args):
|
||||
with gr.Row():
|
||||
rag_create_pipeline = gr.Button("Create Pipeline")
|
||||
rag_activate_pipeline = gr.Button("Activate Pipeline")
|
||||
rag_remove_pipeline = gr.Button("Remove Pipeline")
|
||||
|
||||
with gr.Column(variant="panel"):
|
||||
u_pipeline_name = gr.Textbox(
|
||||
@@ -366,6 +294,7 @@ def build_demo(cfg, args):
|
||||
label="Embedding run device",
|
||||
# info="Run embedding model on which device?",
|
||||
multiselect=False,
|
||||
interactive=True,
|
||||
)
|
||||
|
||||
with gr.Column(variant="panel"):
|
||||
@@ -415,6 +344,7 @@ def build_demo(cfg, args):
|
||||
label="Rerank run device",
|
||||
# info="Run rerank model on which device?",
|
||||
multiselect=False,
|
||||
interactive=True,
|
||||
)
|
||||
|
||||
with gr.Column(variant="panel"):
|
||||
@@ -428,6 +358,10 @@ def build_demo(cfg, args):
|
||||
interactive=True,
|
||||
)
|
||||
|
||||
u_llm_infertype = gr.Radio(
|
||||
choices=avail_llm_inference_type, label="LLM Inference Type", value="local"
|
||||
)
|
||||
|
||||
with gr.Accordion("LLM Configuration", open=True):
|
||||
u_llm_model_id = gr.Dropdown(
|
||||
choices=avail_llms,
|
||||
@@ -444,12 +378,15 @@ def build_demo(cfg, args):
|
||||
label="LLM run device",
|
||||
# info="Run LLM on which device?",
|
||||
multiselect=False,
|
||||
interactive=True,
|
||||
)
|
||||
|
||||
u_llm_weights = gr.Radio(
|
||||
avail_weights_compression,
|
||||
label="Weights",
|
||||
info="weights compression",
|
||||
value=cfg.llm_weights,
|
||||
interactive=True,
|
||||
)
|
||||
|
||||
# -------------------
|
||||
@@ -460,14 +397,9 @@ def build_demo(cfg, args):
|
||||
# get selected pipeline id
|
||||
# Dataframe: {'headers': '', 'data': [[x00, x01], [x10, x11]}
|
||||
# SelectData.index: [i, j]
|
||||
print(u_pipelines.value["data"])
|
||||
print(evt.index)
|
||||
# always use pipeline id for indexing
|
||||
selected_id = pipeline_df[evt.index[0]][0]
|
||||
pl = cli.get_pipeline(selected_id)
|
||||
# TODO: change to json fomart
|
||||
# pl["postprocessor"][0]["processor_type"]
|
||||
# pl["postprocessor"]["model"]["model_id"], pl["postprocessor"]["model"]["device"]
|
||||
return (
|
||||
pl["name"],
|
||||
pl["status"]["active"],
|
||||
@@ -477,12 +409,16 @@ def build_demo(cfg, args):
|
||||
pl["indexer"]["indexer_type"],
|
||||
pl["retriever"]["retriever_type"],
|
||||
pl["retriever"]["retrieve_topk"],
|
||||
pl["postprocessor"][0]["postprocessor_type"],
|
||||
pl["generator"]["generator_type"],
|
||||
pl["generator"]["inference_type"],
|
||||
pl["generator"]["model"]["model_id"],
|
||||
pl["generator"]["model"]["device"],
|
||||
"",
|
||||
pl["generator"]["model"]["weight"],
|
||||
pl["indexer"]["model"]["model_id"],
|
||||
pl["indexer"]["model"]["device"],
|
||||
pl["postprocessor"][0]["model"]["model_id"] if pl["postprocessor"][0]["model"] is not None else "",
|
||||
pl["postprocessor"][0]["model"]["device"] if pl["postprocessor"][0]["model"] is not None else "",
|
||||
)
|
||||
|
||||
def modify_create_pipeline_button():
|
||||
@@ -502,6 +438,7 @@ def build_demo(cfg, args):
|
||||
vector_search_top_k,
|
||||
postprocessor,
|
||||
generator,
|
||||
llm_infertype,
|
||||
llm_id,
|
||||
llm_device,
|
||||
llm_weights,
|
||||
@@ -521,6 +458,7 @@ def build_demo(cfg, args):
|
||||
vector_search_top_k,
|
||||
postprocessor,
|
||||
generator,
|
||||
llm_infertype,
|
||||
llm_id,
|
||||
llm_device,
|
||||
llm_weights,
|
||||
@@ -548,17 +486,18 @@ def build_demo(cfg, args):
|
||||
u_retriever,
|
||||
u_vector_search_top_k,
|
||||
# postprocessor
|
||||
# u_postprocessor,
|
||||
u_postprocessor,
|
||||
# generator
|
||||
u_generator,
|
||||
u_llm_infertype,
|
||||
# models
|
||||
u_llm_model_id,
|
||||
u_llm_device,
|
||||
u_llm_weights,
|
||||
u_embed_model_id,
|
||||
u_embed_device,
|
||||
# u_rerank_model_id,
|
||||
# u_rerank_device
|
||||
u_rerank_model_id,
|
||||
u_rerank_device,
|
||||
],
|
||||
)
|
||||
|
||||
@@ -586,6 +525,7 @@ def build_demo(cfg, args):
|
||||
u_llm_model_id.input,
|
||||
u_llm_device.input,
|
||||
u_llm_weights.input,
|
||||
u_llm_infertype.input,
|
||||
u_embed_model_id.input,
|
||||
u_embed_device.input,
|
||||
u_rerank_model_id.input,
|
||||
@@ -609,6 +549,7 @@ def build_demo(cfg, args):
|
||||
u_vector_search_top_k,
|
||||
u_postprocessor,
|
||||
u_generator,
|
||||
u_llm_infertype,
|
||||
u_llm_model_id,
|
||||
u_llm_device,
|
||||
u_llm_weights,
|
||||
@@ -634,8 +575,8 @@ def build_demo(cfg, args):
|
||||
def get_files():
|
||||
return cli.get_files()
|
||||
|
||||
def create_vectordb(docs, spliter, vector_db):
|
||||
res = cli.create_vectordb(docs, spliter, vector_db)
|
||||
def create_vectordb(docs, spliter):
|
||||
res = cli.create_vectordb(docs, spliter)
|
||||
return gr.update(value=get_files()), res
|
||||
|
||||
global u_files_selected_row
|
||||
@@ -696,13 +637,6 @@ def build_demo(cfg, args):
|
||||
multiselect=False,
|
||||
)
|
||||
|
||||
vector_db = gr.Dropdown(
|
||||
["FAISS", "Chroma"],
|
||||
value=cfg.vector_db,
|
||||
label="Vector Stores",
|
||||
info="Stores embedded data and performs vector search.",
|
||||
multiselect=False,
|
||||
)
|
||||
load_docs = gr.Button("Upload files")
|
||||
|
||||
u_files_status = gr.Textbox(label="File Processing Status", value="", interactive=False)
|
||||
@@ -723,12 +657,6 @@ def build_demo(cfg, args):
|
||||
with gr.Column():
|
||||
deselect_button = gr.Button("Clear Selection")
|
||||
|
||||
do_rag = gr.Checkbox(
|
||||
value=True,
|
||||
label="RAG is ON",
|
||||
interactive=True,
|
||||
info="Whether to do RAG for generation",
|
||||
)
|
||||
with gr.Accordion("Generation Configuration", open=False):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
@@ -778,6 +706,17 @@ def build_demo(cfg, args):
|
||||
interactive=True,
|
||||
info="Penalize repetition — 1.0 to disable.",
|
||||
)
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
u_max_tokens = gr.Slider(
|
||||
label="Max Token Number",
|
||||
value=512,
|
||||
minimum=1,
|
||||
maximum=8192,
|
||||
step=10,
|
||||
interactive=True,
|
||||
info="Set Max Output Token",
|
||||
)
|
||||
with gr.Column(scale=4):
|
||||
chatbot = gr.Chatbot(
|
||||
height=600,
|
||||
@@ -795,7 +734,6 @@ def build_demo(cfg, args):
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
submit = gr.Button("Submit")
|
||||
stop = gr.Button("Stop")
|
||||
clear = gr.Button("Clear")
|
||||
retriever_argument = gr.Accordion("Retriever Configuration", open=True)
|
||||
with retriever_argument:
|
||||
@@ -845,7 +783,6 @@ def build_demo(cfg, args):
|
||||
inputs=[
|
||||
docs,
|
||||
spliter,
|
||||
vector_db,
|
||||
],
|
||||
outputs=[u_files, u_files_status],
|
||||
queue=True,
|
||||
@@ -873,11 +810,9 @@ def build_demo(cfg, args):
|
||||
top_p,
|
||||
top_k,
|
||||
repetition_penalty,
|
||||
u_max_tokens,
|
||||
hide_context,
|
||||
do_rag,
|
||||
docs,
|
||||
spliter,
|
||||
vector_db,
|
||||
u_chunk_size,
|
||||
u_chunk_overlap,
|
||||
u_vector_search_top_k,
|
||||
@@ -897,11 +832,9 @@ def build_demo(cfg, args):
|
||||
top_p,
|
||||
top_k,
|
||||
repetition_penalty,
|
||||
u_max_tokens,
|
||||
hide_context,
|
||||
do_rag,
|
||||
docs,
|
||||
spliter,
|
||||
vector_db,
|
||||
u_chunk_size,
|
||||
u_chunk_overlap,
|
||||
u_vector_search_top_k,
|
||||
@@ -913,15 +846,8 @@ def build_demo(cfg, args):
|
||||
chatbot,
|
||||
queue=True,
|
||||
)
|
||||
# stop.click(
|
||||
# fn=request_cancel,
|
||||
# inputs=None,
|
||||
# outputs=None,
|
||||
# cancels=[submit_event, submit_click_event],
|
||||
# queue=False,
|
||||
# )
|
||||
clear.click(lambda: None, None, chatbot, queue=False)
|
||||
return demo
|
||||
return app
|
||||
|
||||
|
||||
def main():
|
||||
@@ -929,8 +855,6 @@ def main():
|
||||
parser = argparse.ArgumentParser(description="Load Embedding and LLM Models with OpenVino.")
|
||||
# Add the arguments
|
||||
parser.add_argument("--prompt_template", type=str, required=False, help="User specific template")
|
||||
# parser.add_argument("--server_name", type=str, default="0.0.0.0")
|
||||
# parser.add_argument("--server_port", type=int, default=8082)
|
||||
parser.add_argument("--config", type=str, default="./default.yaml", help="configuration file path")
|
||||
parser.add_argument("--share", action="store_true", help="share model")
|
||||
parser.add_argument("--debug", action="store_true", help="enable debugging")
|
||||
@@ -942,20 +866,20 @@ def main():
|
||||
init_cfg_(cfg)
|
||||
logger.info(cfg)
|
||||
|
||||
demo = build_demo(cfg, args)
|
||||
app = build_app(cfg, args)
|
||||
# if you are launching remotely, specify server_name and server_port
|
||||
# demo.launch(server_name='your server name', server_port='server port in int')
|
||||
# app.launch(server_name='your server name', server_port='server port in int')
|
||||
# if you have any issue to launch on your platform, you can pass share=True to launch method:
|
||||
# demo.launch(share=True)
|
||||
# app.launch(share=True)
|
||||
# it creates a publicly shareable link for the interface. Read more in the docs: https://gradio.app/docs/
|
||||
# demo.launch(share=True)
|
||||
demo.queue().launch(
|
||||
# app.launch(share=True)
|
||||
app.queue().launch(
|
||||
server_name=UI_SERVICE_HOST_IP, server_port=UI_SERVICE_PORT, share=args.share, allowed_paths=["."]
|
||||
)
|
||||
|
||||
# %%
|
||||
# please run this cell for stopping gradio interface
|
||||
demo.close()
|
||||
app.close()
|
||||
|
||||
|
||||
def init_cfg_(cfg):
|
||||
@@ -969,14 +893,14 @@ def init_cfg_(cfg):
|
||||
cfg.llm_device = "CPU"
|
||||
if "model_language" not in cfg:
|
||||
cfg.model_language = "Chinese"
|
||||
if "vector_db" not in cfg:
|
||||
cfg.vector_db = "FAISS"
|
||||
if "splitter_name" not in cfg:
|
||||
cfg.splitter_name = "RecursiveCharacter" # or "Chinese"
|
||||
if "search_method" not in cfg:
|
||||
cfg.search_method = "similarity"
|
||||
if "score_threshold" not in cfg:
|
||||
cfg.score_threshold = 0.5
|
||||
if "llm_weights" not in cfg:
|
||||
cfg.llm_weights = "FP16"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -90,6 +90,11 @@ def get_available_weights():
|
||||
return avail_weights_compression
|
||||
|
||||
|
||||
def get_avail_llm_inference_type():
|
||||
avail_llm_inference_type = ["local", "vllm"]
|
||||
return avail_llm_inference_type
|
||||
|
||||
|
||||
def get_enum_values(c: Enum):
|
||||
return [v.value for k, v in vars(c).items() if not callable(v) and not k.startswith("__") and not k.startswith("_")]
|
||||
|
||||
@@ -112,3 +117,25 @@ def get_available_postprocessors():
|
||||
|
||||
def get_available_generators():
|
||||
return get_enum_values(GeneratorType)
|
||||
|
||||
|
||||
def get_llm_model_dir(prefix, llm_model_id, weights_compression):
    model_dirs = {
        "fp16_model_dir": prefix + llm_model_id + "/FP16",
        "int8_model_dir": prefix + llm_model_id + "/INT8_compressed_weights",
        "int4_model_dir": prefix + llm_model_id + "/INT4_compressed_weights",
    }

    if weights_compression == "INT4":
        model_dir = model_dirs["int4_model_dir"]
    elif weights_compression == "INT8":
        model_dir = model_dirs["int8_model_dir"]
    else:
        model_dir = model_dirs["fp16_model_dir"]

    # if not model_dir.exists():
    #     raise FileNotFoundError(f"The model directory {model_dir} does not exist.")
    # elif not model_dir.is_dir():
    #     raise NotADirectoryError(f"The path {model_dir} is not a directory.")

    return model_dir
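For context, the directory layout this helper assumes mirrors the `optimum-cli` export paths used in the README (a sketch; `Qwen/Qwen2-7B-Instruct` is just an example model id):

```bash
# Expected layout under the prefix (default "./models/") for a given model id:
#   <prefix><model_id>/FP16
#   <prefix><model_id>/INT8_compressed_weights
#   <prefix><model_id>/INT4_compressed_weights
ls ./models/Qwen/Qwen2-7B-Instruct/INT4_compressed_weights
```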
8  EdgeCraftRAG/ui/gradio/requirements.txt (new file)
@@ -0,0 +1,8 @@
|
||||
distro>=1.9.0
|
||||
gradio>=4.44.1
|
||||
loguru>=0.7.2
|
||||
omegaconf>=2.3.0
|
||||
openvino>=2024.4.0
|
||||
psutil>=6.1.0
|
||||
py-cpuinfo>=9.0.0
|
||||
uvicorn>=0.30.6
|
||||