Signed-off-by: ZePan110 <ze.pan@intel.com> Signed-off-by: chensuyue <suyue.chen@intel.com> Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com> Signed-off-by: Wang, Xigui <xigui.wang@intel.com> Co-authored-by: ZePan110 <ze.pan@intel.com> Co-authored-by: chen, suyue <suyue.chen@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com> Co-authored-by: lvliang-intel <liang1.lv@intel.com>
66 lines
2.1 KiB
Python
66 lines
2.1 KiB
Python
# Copyright (C) 2024 Intel Corporation
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
from pathlib import Path
|
|
from typing import Any, List, Optional
|
|
|
|
from edgecraftrag.base import BaseComponent, CompType, FileType
|
|
from llama_index.core.schema import Document
|
|
from pydantic import BaseModel, Field, model_serializer
|
|
|
|
|
|
class File(BaseComponent):
    """Component that holds the documents loaded from a path and/or raw text.

    A ``File`` is created from a file or directory path, from literal text
    content, or from both. Whatever can be loaded is accumulated in
    ``documents``.
    """

    # NOTE(review): annotated as ``str`` but ``__init__`` stores a
    # ``pathlib.Path`` here (or ``None`` when only ``file_name`` is given).
    # ``ser_model`` stringifies it, so a missing path serializes as "None".
    file_path: str = Field(default="")
    comp_subtype: str = Field(default="")
    # ``default_factory`` makes it explicit that each instance owns its own
    # list, since ``__init__`` mutates it in place with ``extend``.
    documents: List[Document] = Field(default_factory=list)

    def __init__(self, file_name: Optional[str] = None, file_path: Optional[str] = None, content: Optional[str] = None):
        """Build the component and eagerly load its documents.

        Args:
            file_name: Display name of the component. When omitted, the last
                path segment of ``file_path`` is used instead.
            file_path: Path to a file or directory. Its contents are loaded
                only if the path exists at construction time.
            content: Raw text to wrap as an additional document.

        Raises:
            ValueError: If neither ``file_name`` nor ``file_path`` is given.
        """
        super().__init__(comp_type=CompType.FILE)

        if not file_name and not file_path:
            raise ValueError("File name or path must be provided")

        _path = Path(file_path) if file_path else None
        # An explicit name wins; otherwise the guard above guarantees that
        # ``_path`` is set, so its final component can serve as the name.
        self.name = file_name if file_name else _path.name
        self.file_path = _path
        self.comp_subtype = FileType.TEXT
        # A non-existent path is silently skipped rather than treated as an error.
        if _path is not None and _path.exists():
            self.documents.extend(convert_file_to_documents(_path))
        if content:
            self.documents.extend(convert_text_to_documents(content))

    def run(self, **kwargs) -> Any:
        """Do nothing; this component has no runtime action and returns ``None``."""
        pass

    @model_serializer
    def ser_model(self):
        """Serialize the component to a summary dict.

        Returns:
            A dict with the keys ``file_name``, ``file_id``, ``file_type``,
            ``file_path`` (stringified) and ``docs_count``. The documents
            themselves are not included, only their count.
        """
        # Named ``summary`` so the builtin ``set`` is not shadowed.
        summary = {
            "file_name": self.name,
            "file_id": self.idx,
            "file_type": self.comp_subtype,
            "file_path": str(self.file_path),
            "docs_count": len(self.documents),
        }
        return summary
|
|
|
|
|
|
def convert_text_to_documents(text) -> List[Document]:
    """Wrap raw text in a single-element list of ``Document``.

    The document's metadata carries the fixed ``file_name`` value ``"text"``.
    """
    text_document = Document(text=text, metadata={"file_name": "text"})
    return [text_document]
|
|
|
|
|
|
def convert_file_to_documents(file_path) -> List[Document]:
    """Load documents from a file or directory via ``SimpleDirectoryReader``.

    Args:
        file_path: Path-like object exposing ``is_dir()`` and ``is_file()``.

    Returns:
        The documents produced by the reader: a recursive read for a
        directory, a single-file read for a regular file, and an empty list
        when the path is neither.
    """
    from llama_index.core import SimpleDirectoryReader

    allowed_suffixes = [".pdf", ".txt", ".doc", ".docx", ".pptx", ".ppt", ".csv", ".md", ".html", ".rst"]

    if file_path.is_dir():
        dir_reader = SimpleDirectoryReader(input_dir=file_path, recursive=True, required_exts=allowed_suffixes)
        return dir_reader.load_data()

    if file_path.is_file():
        file_reader = SimpleDirectoryReader(input_files=[file_path], required_exts=allowed_suffixes)
        return file_reader.load_data()

    # Neither a directory nor a regular file: nothing to load.
    return []
|