Signed-off-by: ZePan110 <ze.pan@intel.com> Signed-off-by: chensuyue <suyue.chen@intel.com> Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com> Signed-off-by: Wang, Xigui <xigui.wang@intel.com> Co-authored-by: ZePan110 <ze.pan@intel.com> Co-authored-by: chen, suyue <suyue.chen@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com> Co-authored-by: lvliang-intel <liang1.lv@intel.com>
86 lines
2.4 KiB
Python
86 lines
2.4 KiB
Python
# Copyright (C) 2024 Intel Corporation
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
from typing import Any
|
|
|
|
from edgecraftrag.base import BaseComponent, CompType, NodeParserType
|
|
from llama_index.core.node_parser import HierarchicalNodeParser, SentenceSplitter, SentenceWindowNodeParser
|
|
from pydantic import model_serializer
|
|
|
|
|
|
class SimpleNodeParser(BaseComponent, SentenceSplitter):
    """Node parser component built on ``SentenceSplitter``."""

    # Use super for SentenceSplitter since its __init__ will clean up
    # BaseComponent fields
    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the parent initializers, then tag the component.

        ``comp_type`` and ``comp_subtype`` are assigned after ``super().__init__``
        so they are not overwritten by the parent initialization.
        """
        super().__init__(**kwargs)
        self.comp_type = CompType.NODEPARSER
        self.comp_subtype = NodeParserType.SIMPLE

    def run(self, **kwargs) -> Any:
        """Parse the documents supplied through the ``docs`` keyword argument.

        Returns:
            The result of ``get_nodes_from_documents`` for ``docs``, or ``None``
            when no ``docs`` argument was supplied.
        """
        # Membership test rather than ``.get`` so that an explicit ``docs=None``
        # is still forwarded to the parser instead of being treated as absent.
        if "docs" not in kwargs:
            return None
        return self.get_nodes_from_documents(kwargs["docs"], show_progress=False)

    @model_serializer
    def ser_model(self):
        """Return the dictionary used as this component's serialized form."""
        return {
            "idx": self.idx,
            "parser_type": self.comp_subtype,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
        }
|
|
|
|
|
|
class HierarchyNodeParser(BaseComponent, HierarchicalNodeParser):
    """Node parser component built on ``HierarchicalNodeParser``."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the parent initializers, then tag the component.

        ``comp_type`` and ``comp_subtype`` are assigned after ``super().__init__``
        so they are not overwritten by the parent initialization.
        """
        super().__init__(**kwargs)
        self.comp_type = CompType.NODEPARSER
        self.comp_subtype = NodeParserType.HIERARCHY

    def run(self, **kwargs) -> Any:
        """Parse the documents supplied through the ``docs`` keyword argument.

        Returns:
            The result of ``get_nodes_from_documents`` for ``docs``, or ``None``
            when no ``docs`` argument was supplied.
        """
        # Membership test rather than ``.get`` so that an explicit ``docs=None``
        # is still forwarded to the parser instead of being treated as absent.
        if "docs" not in kwargs:
            return None
        return self.get_nodes_from_documents(kwargs["docs"], show_progress=False)

    @model_serializer
    def ser_model(self):
        """Return the dictionary used as this component's serialized form.

        ``chunk_size`` carries this parser's ``chunk_sizes`` attribute and
        ``chunk_overlap`` is always ``None``.
        """
        return {
            "idx": self.idx,
            "parser_type": self.comp_subtype,
            "chunk_size": self.chunk_sizes,
            "chunk_overlap": None,
        }
|
|
|
|
|
|
class SWindowNodeParser(BaseComponent, SentenceWindowNodeParser):
    """Node parser component built on ``SentenceWindowNodeParser``."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the parent initializers, then tag the component.

        ``comp_type`` and ``comp_subtype`` are assigned after ``super().__init__``
        so they are not overwritten by the parent initialization.
        """
        super().__init__(**kwargs)
        self.comp_type = CompType.NODEPARSER
        self.comp_subtype = NodeParserType.SENTENCEWINDOW

    def run(self, **kwargs) -> Any:
        """Parse the documents supplied through the ``docs`` keyword argument.

        Returns:
            The result of ``get_nodes_from_documents`` for ``docs``, or ``None``
            when no ``docs`` argument was supplied.
        """
        # Membership test rather than ``.get`` so that an explicit ``docs=None``
        # is still forwarded to the parser instead of being treated as absent.
        if "docs" not in kwargs:
            return None
        return self.get_nodes_from_documents(kwargs["docs"], show_progress=False)

    @model_serializer
    def ser_model(self):
        """Return the dictionary used as this component's serialized form.

        ``chunk_size`` and ``chunk_overlap`` are always reported as ``None``
        for this parser type.
        """
        return {
            "idx": self.idx,
            "parser_type": self.comp_subtype,
            "chunk_size": None,
            "chunk_overlap": None,
        }
|