mirror of
https://github.com/langgenius/dify.git
synced 2025-12-19 22:28:46 +00:00
fix: fix fixed_separator (#29861)
This commit is contained in:
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import codecs
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -52,7 +53,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|||||||
def __init__(self, fixed_separator: str = "\n\n", separators: list[str] | None = None, **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        fixed_separator: Separator stored on the instance as
            ``_fixed_separator``. Backslash escape sequences written out
            literally (for example the four characters ``\\n\\n``) are
            converted to the characters they denote, so a separator typed
            as text such as ``\\n\\n---\\n\\n`` becomes real newlines.
            All other characters, including non-ASCII ones such as "。",
            are kept exactly as given.
        separators: Separator list stored as ``_separators``. When it is
            ``None`` or empty the built-in default list is used.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    # Function-scope imports keep this method self-contained.
    import codecs
    import re

    super().__init__(**kwargs)

    def _unescape(match: re.Match[str]) -> str:
        # Each match is a single, pure-ASCII escape sequence, so the
        # "unicode_escape" codec can decode it without touching any of the
        # surrounding (possibly non-ASCII) text.
        try:
            return codecs.decode(match.group(0), "unicode_escape")
        except UnicodeDecodeError:
            # e.g. an unknown \N{...} name or an out-of-range \U value:
            # keep the text literally instead of failing construction.
            return match.group(0)

    # Decoding the whole string with "unicode_escape" would round-trip it
    # through UTF-8 bytes interpreted as Latin-1, which corrupts non-ASCII
    # separators, and would raise on a trailing or truncated backslash
    # escape. Only well-formed escape sequences are therefore rewritten.
    self._fixed_separator = re.sub(
        r"\\(?:[\\'\"abfnrtv\n]|[0-7]{1,3}|x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}|N\{[^}]+\})",
        _unescape,
        fixed_separator,
    )
    self._separators = separators or ["\n\n", "\n", "。", ". ", " ", ""]
|
||||||
|
|
||||||
def split_text(self, text: str) -> list[str]:
|
def split_text(self, text: str) -> list[str]:
|
||||||
|
|||||||
@@ -901,6 +901,13 @@ class TestFixedRecursiveCharacterTextSplitter:
|
|||||||
# Verify no empty chunks
|
# Verify no empty chunks
|
||||||
assert all(len(chunk) > 0 for chunk in result)
|
assert all(len(chunk) > 0 for chunk in result)
|
||||||
|
|
||||||
|
def test_double_slash_n(self):
    """A fixed separator written with literal backslash-n sequences splits on real newlines."""
    expected_chunks = ["chunk 1\n\nsubchunk 1.\nsubchunk 2.", "chunk 2\n\nsubchunk 1\nsubchunk 2."]
    document = "chunk 1\n\nsubchunk 1.\nsubchunk 2.\n\n---\n\nchunk 2\n\nsubchunk 1\nsubchunk 2."

    # The separator is passed in escaped form: backslash + "n", not newline characters.
    text_splitter = FixedRecursiveCharacterTextSplitter(fixed_separator="\\n\\n---\\n\\n")

    assert text_splitter.split_text(document) == expected_chunks
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Test Metadata Preservation
|
# Test Metadata Preservation
|
||||||
|
|||||||
Reference in New Issue
Block a user