Added QNA chat using Qdrant (#100)
Signed-off-by: Anush008 <anushshetty90@gmail.com> Co-authored-by: lvliang-intel <liang1.lv@intel.com>
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,3 +1,5 @@
|
|||||||
**/node_modules
|
**/node_modules
|
||||||
**/.svelte-kit
|
**/.svelte-kit
|
||||||
**/package-lock.json
|
**/package-lock.json
|
||||||
|
|
||||||
|
__pycache__/
|
||||||
@@ -113,25 +113,31 @@ curl 127.0.0.1:9090/embed \
|
|||||||
|
|
||||||
Note: If you want to integrate the TEI service into the LangChain application, you'll need to restart the LangChain backend service after launching the TEI service.
|
Note: If you want to integrate the TEI service into the LangChain application, you'll need to restart the LangChain backend service after launching the TEI service.
|
||||||
|
|
||||||
## Launch Redis and LangChain Backend Service
|
## Launch Vector Database and LangChain Backend Service
|
||||||
|
|
||||||
Update the `HUGGINGFACEHUB_API_TOKEN` environment variable with your huggingface token in the `docker-compose.yml`
|
Update the `HUGGINGFACEHUB_API_TOKEN` environment variable with your huggingface token in the `docker-compose.yml`
|
||||||
|
|
||||||
|
By default, Redis is used as the vector store. To use Qdrant, use the `docker-compose-qdrant.yml` file instead.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd langchain/docker
|
cd langchain/docker
|
||||||
docker compose -f docker-compose.yml up -d
|
docker compose -f docker-compose.yml up -d
|
||||||
|
# To use Qdrant, run
|
||||||
|
# docker compose -f docker-compose-qdrant.yml up -d
|
||||||
cd ../../
|
cd ../../
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> If you modified any files and want that change introduced in this step, add `--build` to the end of the command to build the container image instead of pulling it from dockerhub.
|
> If you modified any files and want that change introduced in this step, add `--build` to the end of the command to build the container image instead of pulling it from dockerhub.
|
||||||
|
|
||||||
## Ingest data into Redis
|
## Ingest Data Into Vector Database
|
||||||
|
|
||||||
Each time the Redis container is launched, data should be ingested into the container using the commands:
|
Each time the vector database container is launched, data should be ingested into the container using the commands:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker exec -it qna-rag-redis-server bash
|
docker exec -it qna-rag-redis-server bash
|
||||||
|
# To use Qdrant, run
|
||||||
|
# docker exec -it qna-rag-qdrant-server bash
|
||||||
cd /ws
|
cd /ws
|
||||||
python ingest.py
|
python ingest.py
|
||||||
```
|
```
|
||||||
|
|||||||
45
ChatQnA/langchain/docker/docker-compose-qdrant.yml
Normal file
45
ChatQnA/langchain/docker/docker-compose-qdrant.yml
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Copyright (c) 2024 Intel Corporation
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
services:
|
||||||
|
qdrant-vector-db:
|
||||||
|
image: qdrant/qdrant:v1.9.0
|
||||||
|
container_name: qdrant-vector-db
|
||||||
|
ports:
|
||||||
|
- "6333:6333"
|
||||||
|
- "6334:6334"
|
||||||
|
qna-rag-qdrant-server:
|
||||||
|
build:
|
||||||
|
args:
|
||||||
|
https_proxy: ${https_proxy}
|
||||||
|
http_proxy: ${http_proxy}
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
context: .
|
||||||
|
image: intel/gen-ai-examples:qna-rag-qdrant-server
|
||||||
|
container_name: qna-rag-qdrant-server
|
||||||
|
environment:
|
||||||
|
- https_proxy=${https_proxy}
|
||||||
|
- HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
|
||||||
|
- "EMBED_MODEL=BAAI/bge-base-en-v1.5"
|
||||||
|
- "VECTOR_DATABASE=QDRANT"
|
||||||
|
- "TGI_LLM_ENDPOINT=http://localhost:8080"
|
||||||
|
# - "TEI_ENDPOINT=http://xxx.xxx.xxx.xxx:9090"  # Uncomment to use a custom TEI endpoint
|
||||||
|
ulimits:
|
||||||
|
memlock:
|
||||||
|
soft: -1 # Set memlock to unlimited (no soft or hard limit)
|
||||||
|
hard: -1
|
||||||
|
volumes:
|
||||||
|
- ../qdrant:/ws
|
||||||
|
- ../test:/test
|
||||||
|
network_mode: "host"
|
||||||
@@ -43,6 +43,7 @@ services:
|
|||||||
- "REDIS_PORT=6379"
|
- "REDIS_PORT=6379"
|
||||||
- "EMBED_MODEL=BAAI/bge-base-en-v1.5"
|
- "EMBED_MODEL=BAAI/bge-base-en-v1.5"
|
||||||
- "REDIS_SCHEMA=schema_dim_768.yml"
|
- "REDIS_SCHEMA=schema_dim_768.yml"
|
||||||
|
- "VECTOR_DATABASE=REDIS"
|
||||||
ulimits:
|
ulimits:
|
||||||
memlock:
|
memlock:
|
||||||
soft: -1 # Set memlock to unlimited (no soft or hard limit)
|
soft: -1 # Set memlock to unlimited (no soft or hard limit)
|
||||||
|
|||||||
@@ -23,15 +23,14 @@ from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
|
|||||||
from guardrails import moderation_prompt_for_chat, unsafe_dict
|
from guardrails import moderation_prompt_for_chat, unsafe_dict
|
||||||
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
|
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
|
||||||
from langchain_community.llms import HuggingFaceEndpoint
|
from langchain_community.llms import HuggingFaceEndpoint
|
||||||
from langchain_community.vectorstores import Redis
|
|
||||||
from langchain_core.messages import HumanMessage
|
from langchain_core.messages import HumanMessage
|
||||||
from langchain_core.output_parsers import StrOutputParser
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
from langchain_core.runnables import RunnablePassthrough
|
from langchain_core.runnables import RunnablePassthrough
|
||||||
from langserve import add_routes
|
from langserve import add_routes
|
||||||
from prompts import contextualize_q_prompt, prompt, qa_prompt
|
from prompts import contextualize_q_prompt, prompt, qa_prompt
|
||||||
from rag_redis.config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL
|
|
||||||
from starlette.middleware.cors import CORSMiddleware
|
from starlette.middleware.cors import CORSMiddleware
|
||||||
from utils import (
|
from utils import (
|
||||||
|
VECTOR_DATABASE,
|
||||||
create_kb_folder,
|
create_kb_folder,
|
||||||
create_retriever_from_files,
|
create_retriever_from_files,
|
||||||
create_retriever_from_links,
|
create_retriever_from_links,
|
||||||
@@ -40,6 +39,11 @@ from utils import (
|
|||||||
reload_retriever,
|
reload_retriever,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if VECTOR_DATABASE == "REDIS":
|
||||||
|
from rag_redis.config import INDEX_NAME
|
||||||
|
elif VECTOR_DATABASE == "QDRANT":
|
||||||
|
from rag_qdrant.config import COLLECTION_NAME as INDEX_NAME
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Server Configuration")
|
parser = argparse.ArgumentParser(description="Server Configuration")
|
||||||
parser.add_argument("--chathistory", action="store_true", help="Enable debug mode")
|
parser.add_argument("--chathistory", action="store_true", help="Enable debug mode")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -52,7 +56,6 @@ app.add_middleware(
|
|||||||
|
|
||||||
|
|
||||||
class RAGAPIRouter(APIRouter):
|
class RAGAPIRouter(APIRouter):
|
||||||
|
|
||||||
def __init__(self, upload_dir, entrypoint, safety_guard_endpoint, tei_endpoint=None) -> None:
|
def __init__(self, upload_dir, entrypoint, safety_guard_endpoint, tei_endpoint=None) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.upload_dir = upload_dir
|
self.upload_dir = upload_dir
|
||||||
@@ -93,15 +96,31 @@ class RAGAPIRouter(APIRouter):
|
|||||||
self.embeddings = HuggingFaceHubEmbeddings(model=tei_endpoint)
|
self.embeddings = HuggingFaceHubEmbeddings(model=tei_endpoint)
|
||||||
else:
|
else:
|
||||||
# create embeddings using local embedding model
|
# create embeddings using local embedding model
|
||||||
|
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
||||||
self.embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
|
self.embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
|
||||||
|
|
||||||
rds = Redis.from_existing_index(
|
if VECTOR_DATABASE == "REDIS":
|
||||||
self.embeddings,
|
from langchain_community.vectorstores import Redis
|
||||||
index_name=INDEX_NAME,
|
from rag_redis.config import INDEX_SCHEMA, REDIS_URL
|
||||||
redis_url=REDIS_URL,
|
|
||||||
schema=INDEX_SCHEMA,
|
vdb = Redis.from_existing_index(
|
||||||
)
|
self.embeddings,
|
||||||
retriever = rds.as_retriever(search_type="mmr")
|
index_name=INDEX_NAME,
|
||||||
|
redis_url=REDIS_URL,
|
||||||
|
schema=INDEX_SCHEMA,
|
||||||
|
)
|
||||||
|
elif VECTOR_DATABASE == "QDRANT":
|
||||||
|
from langchain_community.vectorstores import Qdrant
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from rag_qdrant.config import QDRANT_HOST, QDRANT_PORT
|
||||||
|
|
||||||
|
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
|
||||||
|
vdb = Qdrant(
|
||||||
|
embeddings=self.embeddings,
|
||||||
|
collection_name=INDEX_NAME,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
|
retriever = vdb.as_retriever(search_type="mmr")
|
||||||
|
|
||||||
# Define contextualize chain
|
# Define contextualize chain
|
||||||
self.contextualize_q_chain = contextualize_q_prompt | self.llm | StrOutputParser()
|
self.contextualize_q_chain = contextualize_q_prompt | self.llm | StrOutputParser()
|
||||||
|
|||||||
@@ -28,9 +28,13 @@ import requests
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_community.document_loaders import UnstructuredFileLoader
|
from langchain_community.document_loaders import UnstructuredFileLoader
|
||||||
from langchain_community.vectorstores import Redis
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from rag_redis.config import INDEX_SCHEMA, REDIS_URL
|
|
||||||
|
SUPPORTED_VECTOR_DATABASES = ["REDIS", "QDRANT"]
|
||||||
|
|
||||||
|
VECTOR_DATABASE = str(os.getenv("VECTOR_DATABASE", "redis")).upper()
|
||||||
|
|
||||||
|
assert VECTOR_DATABASE in SUPPORTED_VECTOR_DATABASES, f"Invalid VECTOR_DATABASE: {VECTOR_DATABASE}"
|
||||||
|
|
||||||
|
|
||||||
def get_current_beijing_time():
|
def get_current_beijing_time():
|
||||||
@@ -57,7 +61,6 @@ def create_kb_folder(upload_dir):
|
|||||||
|
|
||||||
|
|
||||||
class Crawler:
|
class Crawler:
|
||||||
|
|
||||||
def __init__(self, pool=None):
|
def __init__(self, pool=None):
|
||||||
if pool:
|
if pool:
|
||||||
assert isinstance(pool, (str, list, tuple)), "url pool should be str, list or tuple"
|
assert isinstance(pool, (str, list, tuple)), "url pool should be str, list or tuple"
|
||||||
@@ -292,16 +295,33 @@ def create_retriever_from_files(doc, embeddings, index_name: str):
|
|||||||
loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
|
loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
|
||||||
chunks = loader.load_and_split(text_splitter)
|
chunks = loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
rds = Redis.from_texts(
|
if VECTOR_DATABASE == "REDIS":
|
||||||
texts=[chunk.page_content for chunk in chunks],
|
from langchain_community.vectorstores import Redis
|
||||||
metadatas=[chunk.metadata for chunk in chunks],
|
from rag_redis.config import INDEX_SCHEMA, REDIS_URL
|
||||||
embedding=embeddings,
|
|
||||||
index_name=index_name,
|
|
||||||
redis_url=REDIS_URL,
|
|
||||||
index_schema=INDEX_SCHEMA,
|
|
||||||
)
|
|
||||||
|
|
||||||
retriever = rds.as_retriever(search_type="mmr")
|
vdb = Redis.from_texts(
|
||||||
|
texts=[chunk.page_content for chunk in chunks],
|
||||||
|
metadatas=[chunk.metadata for chunk in chunks],
|
||||||
|
embedding=embeddings,
|
||||||
|
index_name=index_name,
|
||||||
|
redis_url=REDIS_URL,
|
||||||
|
index_schema=INDEX_SCHEMA,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif VECTOR_DATABASE == "QDRANT":
|
||||||
|
from langchain_community.vectorstores import Qdrant
|
||||||
|
from rag_qdrant.config import COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT
|
||||||
|
|
||||||
|
vdb = Qdrant.from_texts(
|
||||||
|
texts=[chunk.page_content for chunk in chunks],
|
||||||
|
metadatas=[chunk.metadata for chunk in chunks],
|
||||||
|
embedding=embeddings,
|
||||||
|
collection_name=COLLECTION_NAME,
|
||||||
|
host=QDRANT_HOST,
|
||||||
|
port=QDRANT_PORT,
|
||||||
|
)
|
||||||
|
|
||||||
|
retriever = vdb.as_retriever(search_type="mmr")
|
||||||
return retriever
|
return retriever
|
||||||
|
|
||||||
|
|
||||||
@@ -315,29 +335,63 @@ def create_retriever_from_links(embeddings, link_list: list, index_name):
|
|||||||
texts.append(data)
|
texts.append(data)
|
||||||
metadatas.append(metadata)
|
metadatas.append(metadata)
|
||||||
|
|
||||||
rds = Redis.from_texts(
|
if VECTOR_DATABASE == "REDIS":
|
||||||
texts=texts,
|
from langchain_community.vectorstores import Redis
|
||||||
metadatas=metadatas,
|
from rag_redis.config import INDEX_SCHEMA, REDIS_URL
|
||||||
embedding=embeddings,
|
|
||||||
index_name=index_name,
|
|
||||||
redis_url=REDIS_URL,
|
|
||||||
index_schema=INDEX_SCHEMA,
|
|
||||||
)
|
|
||||||
|
|
||||||
retriever = rds.as_retriever(search_type="mmr")
|
vdb = Redis.from_texts(
|
||||||
|
texts=texts,
|
||||||
|
metadatas=metadatas,
|
||||||
|
embedding=embeddings,
|
||||||
|
index_name=index_name,
|
||||||
|
redis_url=REDIS_URL,
|
||||||
|
index_schema=INDEX_SCHEMA,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif VECTOR_DATABASE == "QDRANT":
|
||||||
|
from langchain_community.vectorstores import Qdrant
|
||||||
|
from rag_qdrant.config import COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT
|
||||||
|
|
||||||
|
vdb = Qdrant.from_texts(
|
||||||
|
texts=texts,
|
||||||
|
metadatas=metadatas,
|
||||||
|
embedding=embeddings,
|
||||||
|
collection_name=COLLECTION_NAME,
|
||||||
|
host=QDRANT_HOST,
|
||||||
|
port=QDRANT_PORT,
|
||||||
|
)
|
||||||
|
|
||||||
|
retriever = vdb.as_retriever(search_type="mmr")
|
||||||
return retriever
|
return retriever
|
||||||
|
|
||||||
|
|
||||||
def reload_retriever(embeddings, index_name):
|
def reload_retriever(embeddings, index_name):
|
||||||
print(f"[rag - reload retriever] reload with index: {index_name}")
|
print(f"[rag - reload retriever] reload with index: {index_name}")
|
||||||
rds = Redis.from_existing_index(
|
|
||||||
embeddings,
|
|
||||||
index_name=index_name,
|
|
||||||
redis_url=REDIS_URL,
|
|
||||||
schema=INDEX_SCHEMA,
|
|
||||||
)
|
|
||||||
|
|
||||||
retriever = rds.as_retriever(search_type="mmr")
|
if VECTOR_DATABASE == "REDIS":
|
||||||
|
from langchain_community.vectorstores import Redis
|
||||||
|
from rag_redis.config import INDEX_SCHEMA, REDIS_URL
|
||||||
|
|
||||||
|
vdb = Redis.from_existing_index(
|
||||||
|
embeddings,
|
||||||
|
index_name=index_name,
|
||||||
|
redis_url=REDIS_URL,
|
||||||
|
schema=INDEX_SCHEMA,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif VECTOR_DATABASE == "QDRANT":
|
||||||
|
from langchain_community.vectorstores import Qdrant
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from rag_qdrant.config import COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT
|
||||||
|
|
||||||
|
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
|
||||||
|
vdb = Qdrant(
|
||||||
|
embeddings=embeddings,
|
||||||
|
collection_name=COLLECTION_NAME,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
|
|
||||||
|
retriever = vdb.as_retriever(search_type="mmr")
|
||||||
return retriever
|
return retriever
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ poetry
|
|||||||
pyarrow
|
pyarrow
|
||||||
pydantic==1.10.13
|
pydantic==1.10.13
|
||||||
pymupdf
|
pymupdf
|
||||||
|
qdrant-client==1.9.0
|
||||||
redis
|
redis
|
||||||
sentence-transformers
|
sentence-transformers
|
||||||
unstructured
|
unstructured
|
||||||
|
|||||||
21
ChatQnA/langchain/qdrant/LICENSE
Normal file
21
ChatQnA/langchain/qdrant/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 LangChain, Inc.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
BIN
ChatQnA/langchain/qdrant/data/nke-10k-2023.pdf
Normal file
BIN
ChatQnA/langchain/qdrant/data/nke-10k-2023.pdf
Normal file
Binary file not shown.
106
ChatQnA/langchain/qdrant/ingest.py
Normal file
106
ChatQnA/langchain/qdrant/ingest.py
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Copyright (c) 2024 Intel Corporation
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
|
||||||
|
from langchain_community.vectorstores import Qdrant
|
||||||
|
from PIL import Image
|
||||||
|
from rag_qdrant.config import COLLECTION_NAME, EMBED_MODEL, QDRANT_HOST, QDRANT_PORT, TEI_EMBEDDING_ENDPOINT
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_loader(file_path):
    """Return the text of a PDF, including text OCR'd from its embedded images.

    For every page, the stripped page text is appended first, followed by one
    OCR string per embedded image. Each OCR string is forced to end with
    sentence punctuation ("!", "?" or ".") so that consecutive image texts do
    not run together. Pieces are concatenated without any separator.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string with all extracted text.

    Raises:
        ImportError: If ``fitz`` (PyMuPDF) or ``easyocr`` cannot be imported.
    """
    try:
        import easyocr
        import fitz
    except ImportError as err:
        # Chain the original error so the missing package name stays visible.
        raise ImportError(
            "`PyMuPDF` or 'easyocr' package is not found, please install it with "
            "`pip install pymupdf or pip install easyocr.`"
        ) from err

    pieces = []
    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(file_path) as doc:
        # Built after the document is opened so a bad path fails before the OCR reader is constructed.
        reader = easyocr.Reader(["en"])
        for page_index in range(doc.page_count):
            page_text = doc.load_page(page_index).get_text().strip()
            if page_text:
                pieces.append(page_text)

            # Query the image list once per page instead of twice.
            for image_info in doc.get_page_images(page_index):
                if not image_info:
                    continue
                xref = image_info[0]
                image_bytes = doc.extract_image(xref)["image"]
                with Image.open(io.BytesIO(image_bytes)) as pil_image:
                    pixels = np.array(pil_image)
                ocr_text = ", ".join(reader.readtext(pixels, paragraph=True, detail=0)).strip()
                # An empty OCR result still contributes a "." (original behaviour).
                if not ocr_text.endswith(("!", "?", ".")):
                    ocr_text += "."
                pieces.append(ocr_text)
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_documents():
    """Ingest a PDF from the ``data/`` directory into Qdrant.

    The first entry of ``data/`` (in sorted order, so the choice is
    deterministic) is parsed with ``pdf_loader``, split into overlapping
    chunks, prefixed with the company name and written to the Qdrant
    collection ``COLLECTION_NAME`` in batches of 32 texts.

    Embeddings come from the TEI service when ``TEI_EMBEDDING_ENDPOINT`` is
    set, otherwise from the local ``EMBED_MODEL``.

    Raises:
        FileNotFoundError: If ``data/`` contains no entries.
    """
    company_name = "Nike"
    data_path = "data/"

    # os.listdir order is arbitrary; sort so repeated runs pick the same file.
    entries = sorted(os.listdir(data_path))
    if not entries:
        raise FileNotFoundError(f"No documents found in {data_path!r}; nothing to ingest")
    doc_path = os.path.join(data_path, entries[0])

    print("Parsing 10k filing doc for NIKE", doc_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    content = pdf_loader(doc_path)
    chunks = text_splitter.split_text(content)

    print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

    # Create vectorstore
    if TEI_EMBEDDING_ENDPOINT:
        # create embeddings using TEI endpoint service
        embedder = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT)
    else:
        # create embeddings using local embedding model
        embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)

    batch_size = 32
    num_chunks = len(chunks)
    # Ceiling division written so that zero chunks would report zero batches.
    total_batches = (num_chunks - 1) // batch_size + 1
    for start in range(0, num_chunks, batch_size):
        batch_texts = [f"Company: {company_name}. " + chunk for chunk in chunks[start : start + batch_size]]

        # Each call writes one batch into COLLECTION_NAME; the returned store is not needed.
        Qdrant.from_texts(
            texts=batch_texts,
            embedding=embedder,
            collection_name=COLLECTION_NAME,
            host=QDRANT_HOST,
            port=QDRANT_PORT,
        )
        print(f"Processed batch {start // batch_size + 1}/{total_batches}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
ingest_documents()
|
||||||
94
ChatQnA/langchain/qdrant/rag_qdrant.ipynb
Normal file
94
ChatQnA/langchain/qdrant/rag_qdrant.ipynb
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe1adb29",
|
||||||
|
"metadata": {},
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "681a5d1e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Connect to RAG App\n",
|
||||||
|
"\n",
|
||||||
|
"Assuming you are already running this server:\n",
|
||||||
|
"```bash\n",
|
||||||
|
"langserve start\n",
|
||||||
|
"```"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 37,
|
||||||
|
"id": "d774be2a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Nike's revenue in 2023 was $51.2 billion. \n",
|
||||||
|
"\n",
|
||||||
|
"Source: 'data/nke-10k-2023.pdf', Start Index: '146100'\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langserve.client import RemoteRunnable\n",
|
||||||
|
"\n",
|
||||||
|
"rag_qdrant = RemoteRunnable(\"http://localhost:8000/rag-qdrant\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(rag_qdrant.invoke(\"What was Nike's revenue in 2023?\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"id": "07ae0005",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"As of May 31, 2023, Nike had approximately 83,700 employees worldwide. This information can be found in the first piece of context provided. (source: data/nke-10k-2023.pdf, start_index: 32532)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(rag_qdrant.invoke(\"How many employees work at Nike?\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4a6b9f00",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
13
ChatQnA/langchain/qdrant/rag_qdrant/__init__.py
Normal file
13
ChatQnA/langchain/qdrant/rag_qdrant/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2024 Intel Corporation
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
80
ChatQnA/langchain/qdrant/rag_qdrant/chain.py
Normal file
80
ChatQnA/langchain/qdrant/rag_qdrant/chain.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Copyright (c) 2024 Intel Corporation
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||||
|
from langchain_community.llms import HuggingFaceEndpoint
|
||||||
|
from langchain_community.vectorstores import Qdrant
|
||||||
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
|
from langchain_core.pydantic_v1 import BaseModel
|
||||||
|
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from rag_qdrant.config import COLLECTION_NAME, EMBED_MODEL, QDRANT_HOST, QDRANT_PORT, TGI_LLM_ENDPOINT
|
||||||
|
|
||||||
|
|
||||||
|
# Custom root model used as the chain's input type so the input is a plain string.
|
||||||
|
class Question(BaseModel):
|
||||||
|
__root__: str
|
||||||
|
|
||||||
|
|
||||||
|
# Init Embeddings
|
||||||
|
embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
|
||||||
|
|
||||||
|
# Connect to pre-loaded vectorstore
|
||||||
|
# run the ingest.py script to populate this
|
||||||
|
|
||||||
|
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
|
||||||
|
vectorstore = Qdrant(embeddings=embedder, collection_name=COLLECTION_NAME, client=client)
|
||||||
|
|
||||||
|
# TODO allow user to change parameters
|
||||||
|
retriever = vectorstore.as_retriever(search_type="mmr")
|
||||||
|
|
||||||
|
# Define our prompt
|
||||||
|
template = """
|
||||||
|
Use the following pieces of context from retrieved
|
||||||
|
dataset to answer the question. Do not make up an answer if there is no
|
||||||
|
context provided to help answer it. Include the 'source' and 'start_index'
|
||||||
|
from the metadata included in the context you used to answer the question
|
||||||
|
|
||||||
|
Context:
|
||||||
|
---------
|
||||||
|
{context}
|
||||||
|
|
||||||
|
---------
|
||||||
|
Question: {question}
|
||||||
|
---------
|
||||||
|
|
||||||
|
Answer:
|
||||||
|
"""
|
||||||
|
|
||||||
|
prompt = ChatPromptTemplate.from_template(template)
|
||||||
|
|
||||||
|
# RAG Chain
|
||||||
|
model = HuggingFaceEndpoint(
|
||||||
|
endpoint_url=TGI_LLM_ENDPOINT,
|
||||||
|
max_new_tokens=512,
|
||||||
|
top_k=10,
|
||||||
|
top_p=0.95,
|
||||||
|
typical_p=0.95,
|
||||||
|
temperature=0.01,
|
||||||
|
repetition_penalty=1.03,
|
||||||
|
streaming=True,
|
||||||
|
truncate=1024,
|
||||||
|
)
|
||||||
|
|
||||||
|
chain = (
|
||||||
|
RunnableParallel({"context": retriever, "question": RunnablePassthrough()}) | prompt | model | StrOutputParser()
|
||||||
|
).with_types(input_type=Question)
|
||||||
28
ChatQnA/langchain/qdrant/rag_qdrant/config.py
Normal file
28
ChatQnA/langchain/qdrant/rag_qdrant/config.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Copyright (c) 2024 Intel Corporation
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os

# Embedding model used when no TEI endpoint is configured.
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Qdrant configuration.
# QDRANT_HOST is the preferred variable (it matches QDRANT_PORT); the original
# QDRANT variable is still honoured so existing deployments keep working.
QDRANT_HOST = os.getenv("QDRANT_HOST") or os.getenv("QDRANT", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag-qdrant")

# LLM/Embedding endpoints.
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
# None when TEI_ENDPOINT is unset.
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
|
||||||
@@ -66,6 +66,15 @@ All the examples are well-validated on Intel platforms. In addition, these examp
|
|||||||
<td>Gaudi2</td>
|
<td>Gaudi2</td>
|
||||||
<td>Chatbot</td>
|
<td>Chatbot</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><a href="https://www.langchain.com">LangChain</a></td>
|
||||||
|
<td><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1">Mistral-7B</a></td>
|
||||||
|
<td><a href="https://huggingface.co/BAAI/bge-base-en">BGE-Base</a></td>
|
||||||
|
<td><a href="https://qdrant.tech/">Qdrant</a></td>
|
||||||
|
<td><a href="https://github.com/huggingface/tgi-gaudi">TGI-Habana</a></td>
|
||||||
|
<td>Gaudi2</td>
|
||||||
|
<td>Chatbot</td>
|
||||||
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user