Files
GenAIExamples/ChatQnA/deprecated/langchain/redis/ingest_wiki.py
chen, suyue 509d5c66ca unify license copyright (#234)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-31 17:19:30 +08:00

83 lines
2.7 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
import io
import os
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import ConfluenceLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
from langchain_community.vectorstores import Redis
# from PIL import Image
from rag_redis.config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL
tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
confluence_access_token = os.getenv("CONFLUENCE_ACCESS_TOKEN")
def wiki_loader(wiki_url, page_ids):
loader = ConfluenceLoader(
url=wiki_url,
token=confluence_access_token,
confluence_kwargs={"verify_ssl": False},
)
print(wiki_url)
print(page_ids)
documents = loader.load(page_ids=page_ids, include_attachments=True, limit=50, max_pages=50)
return documents
def ingest_documents(wiki_url, page_ids):
"""Ingest Wiki Pages to Redis from the variables (wiki_url, page_ids) that
contains your contents of interest."""
# Load list of wiki pages
company_name = "Intel"
print("Parsing Intel wiki pages", page_ids)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
documents = wiki_loader(wiki_url, page_ids)
content = ""
for doc in documents:
content += doc.page_content
chunks = text_splitter.split_text(content)
print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")
# Create vectorstore
# Create vectorstore
if tei_embedding_endpoint:
# create embeddings using TEI endpoint service
embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
else:
# create embeddings using local embedding model
embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
# Batch size
batch_size = 2
num_chunks = len(chunks)
for i in range(0, num_chunks, batch_size):
batch_chunks = chunks[i : i + batch_size]
batch_texts = [f"Company: {company_name}. " + chunk for chunk in batch_chunks]
_ = Redis.from_texts(
texts=batch_texts,
embedding=embedder,
index_name=INDEX_NAME,
index_schema=INDEX_SCHEMA,
redis_url=REDIS_URL,
)
print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
if __name__ == "__main__":
wiki_url = "https://wiki.ith.intel.com/"
page_ids = [3458609323, 3467299836]
ingest_documents(wiki_url, page_ids)