Files
GenAIExamples/ChatQnA/deprecated/langchain/redis/ingest_intel.py
chen, suyue 509d5c66ca unify license copyright (#234)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-31 17:19:30 +08:00

101 lines
3.4 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
import io
import os
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
from langchain_community.vectorstores import Redis
from PIL import Image
from rag_redis.config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL
tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
def pdf_loader(file_path):
try:
import easyocr
import fitz
except ImportError:
raise ImportError(
"`PyMuPDF` or 'easyocr' package is not found, please install it with "
"`pip install pymupdf or pip install easyocr.`"
)
doc = fitz.open(file_path)
reader = easyocr.Reader(["en"])
result = ""
for i in range(doc.page_count):
page = doc.load_page(i)
pagetext = page.get_text().strip()
if pagetext:
result = result + pagetext
if len(doc.get_page_images(i)) > 0:
for img in doc.get_page_images(i):
if img:
pageimg = ""
xref = img[0]
img_data = doc.extract_image(xref)
img_bytes = img_data["image"]
pil_image = Image.open(io.BytesIO(img_bytes))
img = np.array(pil_image)
img_result = reader.readtext(img, paragraph=True, detail=0)
pageimg = pageimg + ", ".join(img_result).strip()
if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."):
pass
else:
pageimg = pageimg + "."
result = result + pageimg
return result
def ingest_documents():
"""Ingest PDF to Redis from the data/ directory that
contains Intel manuals."""
# Load list of pdfs
company_name = "Intel"
data_path = "data_intel/"
doc_path = [os.path.join(data_path, file) for file in os.listdir(data_path)][0]
print("Parsing Intel architecture manuals", doc_path)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
content = pdf_loader(doc_path)
chunks = text_splitter.split_text(content)
print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")
# Create vectorstore
# Create vectorstore
if tei_embedding_endpoint:
# create embeddings using TEI endpoint service
embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
else:
# create embeddings using local embedding model
embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
# Batch size
batch_size = 32
num_chunks = len(chunks)
for i in range(0, num_chunks, batch_size):
batch_chunks = chunks[i : i + batch_size]
batch_texts = [f"Company: {company_name}. " + chunk for chunk in batch_chunks]
_ = Redis.from_texts(
texts=batch_texts,
embedding=embedder,
index_name=INDEX_NAME,
index_schema=INDEX_SCHEMA,
redis_url=REDIS_URL,
)
print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
if __name__ == "__main__":
ingest_documents()