#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import io
import os

import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Redis
from PIL import Image
from rag_redis.config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL


def pdf_loader(file_path):
    """Extract text from a PDF, running OCR on embedded images so their text is captured too."""
    try:
        import easyocr
        import fitz  # PyMuPDF
    except ImportError:
        raise ImportError(
            "`pymupdf` or `easyocr` package not found, please install them with `pip install pymupdf easyocr`."
        )

    doc = fitz.open(file_path)
    reader = easyocr.Reader(["en"])
    result = ""
    for i in range(doc.page_count):
        page = doc.load_page(i)
        pagetext = page.get_text().strip()
        if pagetext:
            result = result + pagetext
        # OCR any images embedded in the page and append the recognized text.
        for img in doc.get_page_images(i):
            if not img:
                continue
            xref = img[0]
            img_data = doc.extract_image(xref)
            img_bytes = img_data["image"]
            pil_image = Image.open(io.BytesIO(img_bytes))
            img_array = np.array(pil_image)
            img_result = reader.readtext(img_array, paragraph=True, detail=0)
            pageimg = ", ".join(img_result).strip()
            # Make sure the OCR text ends with sentence punctuation before appending.
            if not pageimg.endswith((".", "!", "?")):
                pageimg = pageimg + "."
            result = result + pageimg
    return result


def ingest_documents():
    """Ingest the PDF under the data/ directory (Nike's Edgar 10-K filing) into a Redis vector store."""
    # Use the first file found in the data/ directory.
    company_name = "Nike"
    data_path = "data/"
    doc_path = [os.path.join(data_path, file) for file in os.listdir(data_path)][0]

    print("Parsing 10k filing doc for NIKE", doc_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    content = pdf_loader(doc_path)
    chunks = text_splitter.split_text(content)

    print(f"Done preprocessing. Created {len(chunks)} chunks of the original pdf")

    # Embed each chunk and write it to the Redis index.
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    _ = Redis.from_texts(
        # Prefixing each chunk with the company name can help semantic retrieval,
        # especially when the index holds documents from multiple companies.
        texts=[f"Company: {company_name}. " + chunk for chunk in chunks],
        embedding=embedder,
        index_name=INDEX_NAME,
        index_schema=INDEX_SCHEMA,
        redis_url=REDIS_URL,
    )


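# Illustrative sketch, not part of the original ingestion flow: one way to query the index
# created by ingest_documents(), assuming the same rag_redis.config values and a reachable
# Redis instance. The function name and sample query are made up for illustration, and the
# function is never called by this script.
def example_similarity_search(query="What was Nike's revenue?", k=3):
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    vectorstore = Redis.from_existing_index(
        embedding=embedder,
        index_name=INDEX_NAME,
        schema=INDEX_SCHEMA,
        redis_url=REDIS_URL,
    )
    # Return the k chunks whose embeddings are closest to the query embedding.
    return vectorstore.similarity_search(query, k=k)

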
if __name__ == "__main__":
    ingest_documents()