fix: fix fixed_separator (#29861)

This commit is contained in:
wangxiaolei
2025-12-18 16:50:44 +08:00
committed by GitHub
parent 9f24cff9dd
commit 78ca5ad142
2 changed files with 9 additions and 1 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import codecs
import re
from typing import Any
@@ -52,7 +53,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
def __init__(self, fixed_separator: str = "\n\n", separators: list[str] | None = None, **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        fixed_separator: Separator stored on the instance. Escape sequences
            that arrive spelled out literally (a backslash followed by ``n``,
            ``t``, ``xHH``, ``uHHHH``, ...) are converted to the characters
            they denote; all other text, including non-ASCII characters, is
            kept exactly as given.
        separators: Fallback separators; a built-in default list is used when
            this is ``None`` or empty.
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)

    def _unescape(match: re.Match[str]) -> str:
        # Each match is pure ASCII, so "unicode_escape" (which reads its input
        # as Latin-1) cannot mangle it.
        try:
            return codecs.decode(match.group(0), "unicode_escape")
        except UnicodeDecodeError:
            # e.g. \U beyond U+10FFFF or an unknown \N{...} name: keep literal.
            return match.group(0)

    # Decoding the whole string with "unicode_escape" would corrupt non-ASCII
    # separators and raise on a dangling backslash, so only well-formed escape
    # sequences are rewritten.
    self._fixed_separator = re.sub(
        r"\\(?:[\\'\"abfnrtv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8}|N\{[^}]+\})",
        _unescape,
        fixed_separator,
    )
    self._separators = separators or ["\n\n", "\n", "", ". ", " ", ""]
def split_text(self, text: str) -> list[str]:

View File

@@ -901,6 +901,13 @@ class TestFixedRecursiveCharacterTextSplitter:
# Verify no empty chunks
assert all(len(chunk) > 0 for chunk in result)
def test_double_slash_n(self):
    """A separator written with literal backslash-n escapes splits on real newlines."""
    text = "chunk 1\n\nsubchunk 1.\nsubchunk 2.\n\n---\n\nchunk 2\n\nsubchunk 1\nsubchunk 2."
    # The separator holds backslash + "n" pairs, not newline characters.
    escaped_separator = "\\n\\n---\\n\\n"
    expected = ["chunk 1\n\nsubchunk 1.\nsubchunk 2.", "chunk 2\n\nsubchunk 1\nsubchunk 2."]

    result = FixedRecursiveCharacterTextSplitter(fixed_separator=escaped_separator).split_text(text)

    assert result == expected
# ============================================================================
# Test Metadata Preservation