Files
GenAIExamples/AgentQnA/retrieval_tool/index_data.py

78 lines
2.2 KiB
Python

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import os
import requests
import tqdm
def get_args():
parser = argparse.ArgumentParser(description="Index data")
parser.add_argument("--host_ip", type=str, default="localhost", help="Host IP")
parser.add_argument("--port", type=int, default=6007, help="Port")
parser.add_argument("--filedir", type=str, default=None, help="file directory")
parser.add_argument("--filename", type=str, default=None, help="file name")
parser.add_argument("--chunk_size", type=int, default=10000, help="Chunk size")
parser.add_argument("--chunk_overlap", type=int, default=0, help="Chunk overlap")
args = parser.parse_args()
return args
def split_jsonl_into_txts(jsonl_file):
docs = []
n = 0
with open(jsonl_file, "r") as f:
for line in f:
data = json.loads(line)
docs.append(data["doc"])
return docs
def write_docs_to_disk(docs, output_folder):
output_files = []
for i, text in enumerate(docs):
output = os.path.join(output_folder, str(i) + ".txt")
output_files.append(output)
with open(output, "w") as f:
f.write(text)
return output_files
def delete_files(files):
for file in files:
os.remove(file)
def main():
args = get_args()
print(args)
host_ip = args.host_ip
port = args.port
proxies = {"http": ""}
url = "http://{host_ip}:{port}/v1/dataprep/ingest".format(host_ip=host_ip, port=port)
# Split jsonl file into json files
files = split_jsonl_into_txts(os.path.join(args.filedir, args.filename))
file_list = write_docs_to_disk(files, args.filedir)
print(file_list)
for file in tqdm.tqdm(file_list):
print("Indexing file: ", file)
files = [("files", (f, open(f, "rb"))) for f in [file]]
payload = {"chunk_size": args.chunk_size, "chunk_overlap": args.chunk_overlap}
resp = requests.request("POST", url=url, headers={}, files=files, data=payload, proxies=proxies)
print(resp.text)
print("Removing temp files....")
delete_files(file_list)
print("ALL DONE!")
if __name__ == "__main__":
main()