Files
Zhu Yongbo c9088eb824 Add EdgeCraftRag as a GenAIExample (#1072)
Signed-off-by: ZePan110 <ze.pan@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com>
Signed-off-by: Wang, Xigui <xigui.wang@intel.com>
Co-authored-by: ZePan110 <ze.pan@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com>
Co-authored-by: lvliang-intel <liang1.lv@intel.com>
2024-11-08 21:07:24 +08:00

66 lines
2.1 KiB
Python

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
from typing import Any, List, Optional
from edgecraftrag.base import BaseComponent, CompType, FileType
from llama_index.core.schema import Document
from pydantic import BaseModel, Field, model_serializer
class File(BaseComponent):
file_path: str = Field(default="")
comp_subtype: str = Field(default="")
documents: List[Document] = Field(default=[])
def __init__(self, file_name: Optional[str] = None, file_path: Optional[str] = None, content: Optional[str] = None):
super().__init__(comp_type=CompType.FILE)
if not file_name and not file_path:
raise ValueError("File name or path must be provided")
_path = Path(file_path) if file_path else None
if file_name:
self.name = file_name
else:
self.name = _path.name
self.file_path = _path
self.comp_subtype = FileType.TEXT
if _path and _path.exists():
self.documents.extend(convert_file_to_documents(_path))
if content:
self.documents.extend(convert_text_to_documents(content))
def run(self, **kwargs) -> Any:
pass
@model_serializer
def ser_model(self):
set = {
"file_name": self.name,
"file_id": self.idx,
"file_type": self.comp_subtype,
"file_path": str(self.file_path),
"docs_count": len(self.documents),
}
return set
def convert_text_to_documents(text) -> List[Document]:
return [Document(text=text, metadata={"file_name": "text"})]
def convert_file_to_documents(file_path) -> List[Document]:
from llama_index.core import SimpleDirectoryReader
supported_exts = [".pdf", ".txt", ".doc", ".docx", ".pptx", ".ppt", ".csv", ".md", ".html", ".rst"]
if file_path.is_dir():
docs = SimpleDirectoryReader(input_dir=file_path, recursive=True, required_exts=supported_exts).load_data()
elif file_path.is_file():
docs = SimpleDirectoryReader(input_files=[file_path], required_exts=supported_exts).load_data()
else:
docs = []
return docs