GenAIExamples/EdgeCraftRAG/edgecraftrag/components/data.py

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from pathlib import Path
from typing import Any, List, Optional

from edgecraftrag.base import BaseComponent, CompType, FileType
from llama_index.core.schema import Document
from pydantic import BaseModel, Field, model_serializer


class File(BaseComponent):
    file_path: str = Field(default="")
    comp_subtype: str = Field(default="")
    documents: List[Document] = Field(default=[])

    def __init__(self, file_name: Optional[str] = None, file_path: Optional[str] = None, content: Optional[str] = None):
        super().__init__(comp_type=CompType.FILE)

        if not file_name and not file_path:
            raise ValueError("File name or path must be provided")

        _path = Path(file_path) if file_path else None
        if file_name:
            self.name = file_name
        else:
            self.name = _path.name
        self.file_path = _path
        self.comp_subtype = FileType.TEXT
        if _path and _path.exists():
            self.documents.extend(convert_file_to_documents(_path))
        if content:
            self.documents.extend(convert_text_to_documents(content))

    def run(self, **kwargs) -> Any:
        pass

    @model_serializer
    def ser_model(self):
        set = {
            "file_name": self.name,
            "file_id": self.idx,
            "file_type": self.comp_subtype,
            "file_path": str(self.file_path),
            "docs_count": len(self.documents),
        }
        return set


def convert_text_to_documents(text) -> List[Document]:
    return [Document(text=text, metadata={"file_name": "text"})]


def convert_file_to_documents(file_path) -> List[Document]:
    from llama_index.core import SimpleDirectoryReader

    supported_exts = [".pdf", ".txt", ".doc", ".docx", ".pptx", ".ppt", ".csv", ".md", ".html", ".rst"]
    if file_path.is_dir():
        docs = SimpleDirectoryReader(input_dir=file_path, recursive=True, required_exts=supported_exts).load_data()
    elif file_path.is_file():
        docs = SimpleDirectoryReader(input_files=[file_path], required_exts=supported_exts).load_data()
    else:
        docs = []

    return docs