Compare commits

...

4 Commits

Author SHA1 Message Date
Coding On Star
5aadaa17db Merge branch 'main' into refactor/step-two 2026-01-07 17:44:19 +08:00
CodingOnStar
88db00019e test: enhance string handling tests for escape/unescape functions
- Updated test case to clarify handling of complex strings without backslashes.
- Added new test case to document behavior for strings containing existing backslashes, highlighting the non-symmetrical nature of escape/unescape functions.
- Improved code readability and understanding of string manipulation scenarios.
2026-01-07 17:43:22 +08:00
CodingOnStar
4b8c37c1f3 feat: implement step two of dataset creation with comprehensive UI components and hooks
- Added new components for general chunking options, parent-child options, preview panel, and step two footer.
- Introduced hooks for document creation, indexing configuration, indexing estimation, preview state, and segmentation state.
- Created types for step two props and integrated them into the components.
- Implemented escape and unescape utility functions for handling special characters.
- Established a structured approach for managing dataset creation workflow, enhancing user experience and functionality.
2026-01-07 15:16:06 +08:00
wangxiaolei
187bfafe8b fix: fix assign value stand as default (#30651)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2026-01-07 14:54:11 +08:00
21 changed files with 4618 additions and 1167 deletions

View File

@@ -358,6 +358,25 @@ class AdvancedChatAppGenerateTaskPipeline(GraphRuntimeStateSupport):
if node_finish_resp:
yield node_finish_resp
# For ANSWER nodes, check if we need to send a message_replace event
# Only send if the final output differs from the accumulated task_state.answer
# This happens when variables were updated by variable_assigner during workflow execution
if event.node_type == NodeType.ANSWER and event.outputs:
final_answer = event.outputs.get("answer")
if final_answer is not None and final_answer != self._task_state.answer:
logger.info(
"ANSWER node final output '%s' differs from accumulated answer '%s', sending message_replace event",
final_answer,
self._task_state.answer,
)
# Update the task state answer
self._task_state.answer = str(final_answer)
# Send message_replace event to update the UI
yield self._message_cycle_manager.message_replace_to_stream_response(
answer=str(final_answer),
reason="variable_update",
)
def _handle_node_failed_events(
self,
event: Union[QueueNodeFailedEvent, QueueNodeExceptionEvent],

View File

@@ -0,0 +1,390 @@
"""
Tests for AdvancedChatAppGenerateTaskPipeline._handle_node_succeeded_event method,
specifically testing the ANSWER node message_replace logic.
"""
from datetime import datetime
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, patch
import pytest
from core.app.entities.app_invoke_entities import AdvancedChatAppGenerateEntity
from core.app.entities.queue_entities import QueueNodeSucceededEvent
from core.workflow.enums import NodeType
from models import EndUser
from models.model import AppMode
class TestAnswerNodeMessageReplace:
"""Test cases for ANSWER node message_replace event logic."""
@pytest.fixture
def mock_application_generate_entity(self):
"""Create a mock application generate entity."""
entity = Mock(spec=AdvancedChatAppGenerateEntity)
entity.task_id = "test-task-id"
entity.app_id = "test-app-id"
entity.workflow_run_id = "test-workflow-run-id"
# minimal app_config used by pipeline internals
entity.app_config = SimpleNamespace(
tenant_id="test-tenant-id",
app_id="test-app-id",
app_mode=AppMode.ADVANCED_CHAT,
app_model_config_dict={},
additional_features=None,
sensitive_word_avoidance=None,
)
entity.query = "test query"
entity.files = []
entity.extras = {}
entity.trace_manager = None
entity.inputs = {}
entity.invoke_from = "debugger"
return entity
@pytest.fixture
def mock_workflow(self):
"""Create a mock workflow."""
workflow = Mock()
workflow.id = "test-workflow-id"
workflow.features_dict = {}
return workflow
@pytest.fixture
def mock_queue_manager(self):
"""Create a mock queue manager."""
manager = Mock()
manager.listen.return_value = []
manager.graph_runtime_state = None
return manager
@pytest.fixture
def mock_conversation(self):
"""Create a mock conversation."""
conversation = Mock()
conversation.id = "test-conversation-id"
conversation.mode = "advanced_chat"
return conversation
@pytest.fixture
def mock_message(self):
"""Create a mock message."""
message = Mock()
message.id = "test-message-id"
message.query = "test query"
message.created_at = Mock()
message.created_at.timestamp.return_value = 1234567890
return message
@pytest.fixture
def mock_user(self):
"""Create a mock end user."""
user = MagicMock(spec=EndUser)
user.id = "test-user-id"
user.session_id = "test-session-id"
return user
@pytest.fixture
def mock_draft_var_saver_factory(self):
"""Create a mock draft variable saver factory."""
return Mock()
@pytest.fixture
def pipeline(
self,
mock_application_generate_entity,
mock_workflow,
mock_queue_manager,
mock_conversation,
mock_message,
mock_user,
mock_draft_var_saver_factory,
):
"""Create an AdvancedChatAppGenerateTaskPipeline instance with mocked dependencies."""
from core.app.apps.advanced_chat.generate_task_pipeline import AdvancedChatAppGenerateTaskPipeline
with patch("core.app.apps.advanced_chat.generate_task_pipeline.db"):
pipeline = AdvancedChatAppGenerateTaskPipeline(
application_generate_entity=mock_application_generate_entity,
workflow=mock_workflow,
queue_manager=mock_queue_manager,
conversation=mock_conversation,
message=mock_message,
user=mock_user,
stream=True,
dialogue_count=1,
draft_var_saver_factory=mock_draft_var_saver_factory,
)
# Initialize workflow run id to avoid validation errors
pipeline._workflow_run_id = "test-workflow-run-id"
# Mock the message cycle manager methods we need to track
pipeline._message_cycle_manager.message_replace_to_stream_response = Mock()
return pipeline
def test_answer_node_with_different_output_sends_message_replace(self, pipeline, mock_application_generate_entity):
"""
Test that when an ANSWER node's final output differs from accumulated answer,
a message_replace event is sent.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "initial answer"
# Create ANSWER node succeeded event with different final output
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={"answer": "updated final answer"},
)
# Mock the workflow response converter to avoid extra processing
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
responses = list(pipeline._handle_node_succeeded_event(event))
# Assert
assert pipeline._task_state.answer == "updated final answer"
# Verify message_replace was called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_called_once_with(
answer="updated final answer", reason="variable_update"
)
def test_answer_node_with_same_output_does_not_send_message_replace(self, pipeline):
"""
Test that when an ANSWER node's final output is the same as accumulated answer,
no message_replace event is sent.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "same answer"
# Create ANSWER node succeeded event with same output
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={"answer": "same answer"},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: answer should remain unchanged
assert pipeline._task_state.answer == "same answer"
# Verify message_replace was NOT called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_not_called()
def test_answer_node_with_none_output_does_not_send_message_replace(self, pipeline):
"""
Test that when an ANSWER node's output is None or missing 'answer' key,
no message_replace event is sent.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "existing answer"
# Create ANSWER node succeeded event with None output
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={"answer": None},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: answer should remain unchanged
assert pipeline._task_state.answer == "existing answer"
# Verify message_replace was NOT called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_not_called()
def test_answer_node_with_empty_outputs_does_not_send_message_replace(self, pipeline):
"""
Test that when an ANSWER node has empty outputs dict,
no message_replace event is sent.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "existing answer"
# Create ANSWER node succeeded event with empty outputs
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: answer should remain unchanged
assert pipeline._task_state.answer == "existing answer"
# Verify message_replace was NOT called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_not_called()
def test_answer_node_with_no_answer_key_in_outputs(self, pipeline):
"""
Test that when an ANSWER node's outputs don't contain 'answer' key,
no message_replace event is sent.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "existing answer"
# Create ANSWER node succeeded event without 'answer' key in outputs
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={"other_key": "some value"},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: answer should remain unchanged
assert pipeline._task_state.answer == "existing answer"
# Verify message_replace was NOT called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_not_called()
def test_non_answer_node_does_not_send_message_replace(self, pipeline):
"""
Test that non-ANSWER nodes (e.g., LLM, END) don't trigger message_replace events.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "existing answer"
# Test with LLM node
llm_event = QueueNodeSucceededEvent(
node_execution_id="test-llm-execution-id",
node_id="test-llm-node",
node_type=NodeType.LLM,
start_at=datetime.now(),
outputs={"answer": "different answer"},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(llm_event))
# Assert: answer should remain unchanged
assert pipeline._task_state.answer == "existing answer"
# Verify message_replace was NOT called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_not_called()
def test_end_node_does_not_send_message_replace(self, pipeline):
"""
Test that END nodes don't trigger message_replace events even with 'answer' output.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "existing answer"
# Create END node succeeded event with answer output
event = QueueNodeSucceededEvent(
node_execution_id="test-end-execution-id",
node_id="test-end-node",
node_type=NodeType.END,
start_at=datetime.now(),
outputs={"answer": "different answer"},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: answer should remain unchanged
assert pipeline._task_state.answer == "existing answer"
# Verify message_replace was NOT called
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_not_called()
def test_answer_node_with_numeric_output_converts_to_string(self, pipeline):
"""
Test that when an ANSWER node's final output is numeric,
it gets converted to string properly.
"""
# Arrange: Set initial accumulated answer
pipeline._task_state.answer = "text answer"
# Create ANSWER node succeeded event with numeric output
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={"answer": 12345},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: answer should be converted to string
assert pipeline._task_state.answer == "12345"
# Verify message_replace was called with string
pipeline._message_cycle_manager.message_replace_to_stream_response.assert_called_once_with(
answer="12345", reason="variable_update"
)
def test_answer_node_files_are_recorded(self, pipeline):
"""
Test that ANSWER nodes properly record files from outputs.
"""
# Arrange
pipeline._task_state.answer = "existing answer"
# Create ANSWER node succeeded event with files
event = QueueNodeSucceededEvent(
node_execution_id="test-node-execution-id",
node_id="test-answer-node",
node_type=NodeType.ANSWER,
start_at=datetime.now(),
outputs={
"answer": "same answer",
"files": [
{"type": "image", "transfer_method": "remote_url", "remote_url": "http://example.com/img.png"}
],
},
)
# Mock the workflow response converter
pipeline._workflow_response_converter.fetch_files_from_node_outputs = Mock(return_value=event.outputs["files"])
pipeline._workflow_response_converter.workflow_node_finish_to_stream_response = Mock(return_value=None)
pipeline._save_output_for_event = Mock()
# Act
list(pipeline._handle_node_succeeded_event(event))
# Assert: files should be recorded
assert len(pipeline._recorded_files) == 1
assert pipeline._recorded_files[0] == event.outputs["files"][0]

View File

@@ -0,0 +1,199 @@
'use client'
import type { FC } from 'react'
import type { PreProcessingRule } from '@/models/datasets'
import {
RiAlertFill,
RiSearchEyeLine,
} from '@remixicon/react'
import Image from 'next/image'
import { useTranslation } from 'react-i18next'
import Button from '@/app/components/base/button'
import Checkbox from '@/app/components/base/checkbox'
import Divider from '@/app/components/base/divider'
import Tooltip from '@/app/components/base/tooltip'
import { IS_CE_EDITION } from '@/config'
import { ChunkingMode } from '@/models/datasets'
import SettingCog from '../../assets/setting-gear-mod.svg'
import s from '../index.module.css'
import LanguageSelect from '../language-select'
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
import { OptionCard } from './option-card'
type TextLabelProps = {
children: React.ReactNode
}
const TextLabel: FC<TextLabelProps> = ({ children }) => {
return <label className="system-sm-semibold text-text-secondary">{children}</label>
}
type GeneralChunkingOptionsProps = {
// State
segmentIdentifier: string
maxChunkLength: number
overlap: number
rules: PreProcessingRule[]
currentDocForm: ChunkingMode
docLanguage: string
// Flags
isActive: boolean
isInUpload: boolean
isNotUploadInEmptyDataset: boolean
hasCurrentDatasetDocForm: boolean
// Actions
onSegmentIdentifierChange: (value: string) => void
onMaxChunkLengthChange: (value: number) => void
onOverlapChange: (value: number) => void
onRuleToggle: (id: string) => void
onDocFormChange: (form: ChunkingMode) => void
onDocLanguageChange: (lang: string) => void
onPreview: () => void
onReset: () => void
// Locale
locale: string
}
export const GeneralChunkingOptions: FC<GeneralChunkingOptionsProps> = ({
segmentIdentifier,
maxChunkLength,
overlap,
rules,
currentDocForm,
docLanguage,
isActive,
isInUpload,
isNotUploadInEmptyDataset,
hasCurrentDatasetDocForm,
onSegmentIdentifierChange,
onMaxChunkLengthChange,
onOverlapChange,
onRuleToggle,
onDocFormChange,
onDocLanguageChange,
onPreview,
onReset,
locale,
}) => {
const { t } = useTranslation()
const getRuleName = (key: string): string => {
const ruleNameMap: Record<string, string> = {
remove_extra_spaces: t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }),
remove_urls_emails: t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }),
remove_stopwords: t('stepTwo.removeStopwords', { ns: 'datasetCreation' }),
}
return ruleNameMap[key] ?? key
}
return (
<OptionCard
className="mb-2 bg-background-section"
title={t('stepTwo.general', { ns: 'datasetCreation' })}
icon={<Image width={20} height={20} src={SettingCog} alt={t('stepTwo.general', { ns: 'datasetCreation' })} />}
activeHeaderClassName="bg-dataset-option-card-blue-gradient"
description={t('stepTwo.generalTip', { ns: 'datasetCreation' })}
isActive={isActive}
onSwitched={() => onDocFormChange(ChunkingMode.text)}
actions={(
<>
<Button variant="secondary-accent" onClick={onPreview}>
<RiSearchEyeLine className="mr-0.5 h-4 w-4" />
{t('stepTwo.previewChunk', { ns: 'datasetCreation' })}
</Button>
<Button variant="ghost" onClick={onReset}>
{t('stepTwo.reset', { ns: 'datasetCreation' })}
</Button>
</>
)}
noHighlight={isInUpload && isNotUploadInEmptyDataset}
>
<div className="flex flex-col gap-y-4">
<div className="flex gap-3">
<DelimiterInput
value={segmentIdentifier}
onChange={e => onSegmentIdentifierChange(e.target.value)}
/>
<MaxLengthInput
unit="characters"
value={maxChunkLength}
onChange={onMaxChunkLengthChange}
/>
<OverlapInput
unit="characters"
value={overlap}
min={1}
onChange={onOverlapChange}
/>
</div>
<div className="flex w-full flex-col">
<div className="flex items-center gap-x-2">
<div className="inline-flex shrink-0">
<TextLabel>{t('stepTwo.rules', { ns: 'datasetCreation' })}</TextLabel>
</div>
<Divider className="grow" bgStyle="gradient" />
</div>
<div className="mt-1">
{rules.map(rule => (
<div
key={rule.id}
className={s.ruleItem}
onClick={() => onRuleToggle(rule.id)}
>
<Checkbox checked={rule.enabled} />
<label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
{getRuleName(rule.id)}
</label>
</div>
))}
{IS_CE_EDITION && (
<>
<Divider type="horizontal" className="my-4 bg-divider-subtle" />
<div className="flex items-center py-0.5">
<div
className="flex items-center"
onClick={() => {
if (hasCurrentDatasetDocForm)
return
if (currentDocForm === ChunkingMode.qa)
onDocFormChange(ChunkingMode.text)
else
onDocFormChange(ChunkingMode.qa)
}}
>
<Checkbox
checked={currentDocForm === ChunkingMode.qa}
disabled={hasCurrentDatasetDocForm}
/>
<label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
{t('stepTwo.useQALanguage', { ns: 'datasetCreation' })}
</label>
</div>
<LanguageSelect
currentLanguage={docLanguage || locale}
onSelect={onDocLanguageChange}
disabled={currentDocForm !== ChunkingMode.qa}
/>
<Tooltip popupContent={t('stepTwo.QATip', { ns: 'datasetCreation' })} />
</div>
{currentDocForm === ChunkingMode.qa && (
<div
style={{
background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
}}
className="mt-2 flex h-10 items-center gap-2 rounded-xl border border-components-panel-border px-3 text-xs shadow-xs backdrop-blur-[5px]"
>
<RiAlertFill className="size-4 text-text-warning-secondary" />
<span className="system-xs-medium text-text-primary">
{t('stepTwo.QATip', { ns: 'datasetCreation' })}
</span>
</div>
)}
</>
)}
</div>
</div>
</div>
</OptionCard>
)
}

View File

@@ -0,0 +1,5 @@
export { GeneralChunkingOptions } from './general-chunking-options'
export { IndexingModeSection } from './indexing-mode-section'
export { ParentChildOptions } from './parent-child-options'
export { PreviewPanel } from './preview-panel'
export { StepTwoFooter } from './step-two-footer'

View File

@@ -0,0 +1,253 @@
'use client'
import type { FC } from 'react'
import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations'
import type { RetrievalConfig } from '@/types/app'
import Image from 'next/image'
import Link from 'next/link'
import { useTranslation } from 'react-i18next'
import Badge from '@/app/components/base/badge'
import Button from '@/app/components/base/button'
import CustomDialog from '@/app/components/base/dialog'
import Divider from '@/app/components/base/divider'
import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
import Tooltip from '@/app/components/base/tooltip'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
import { useDocLink } from '@/context/i18n'
import { ChunkingMode } from '@/models/datasets'
import { cn } from '@/utils/classnames'
import { indexMethodIcon } from '../../icons'
import { IndexingType } from '../hooks'
import s from '../index.module.css'
import { OptionCard } from './option-card'
type IndexingModeSectionProps = {
// State
indexType: IndexingType
hasSetIndexType: boolean
docForm: ChunkingMode
embeddingModel: DefaultModel
embeddingModelList?: Model[]
retrievalConfig: RetrievalConfig
showMultiModalTip: boolean
// Flags
isModelAndRetrievalConfigDisabled: boolean
datasetId?: string
// Modal state
isQAConfirmDialogOpen: boolean
// Actions
onIndexTypeChange: (type: IndexingType) => void
onEmbeddingModelChange: (model: DefaultModel) => void
onRetrievalConfigChange: (config: RetrievalConfig) => void
onQAConfirmDialogClose: () => void
onQAConfirmDialogConfirm: () => void
}
export const IndexingModeSection: FC<IndexingModeSectionProps> = ({
indexType,
hasSetIndexType,
docForm,
embeddingModel,
embeddingModelList,
retrievalConfig,
showMultiModalTip,
isModelAndRetrievalConfigDisabled,
datasetId,
isQAConfirmDialogOpen,
onIndexTypeChange,
onEmbeddingModelChange,
onRetrievalConfigChange,
onQAConfirmDialogClose,
onQAConfirmDialogConfirm,
}) => {
const { t } = useTranslation()
const docLink = useDocLink()
const getIndexingTechnique = () => indexType
return (
<>
{/* Index Mode */}
<div className="system-md-semibold mb-1 text-text-secondary">
{t('stepTwo.indexMode', { ns: 'datasetCreation' })}
</div>
<div className="flex items-center gap-2">
{/* Qualified option */}
{(!hasSetIndexType || (hasSetIndexType && indexType === IndexingType.QUALIFIED)) && (
<OptionCard
className="flex-1 self-stretch"
title={(
<div className="flex items-center">
{t('stepTwo.qualified', { ns: 'datasetCreation' })}
<Badge
className={cn(
'ml-1 h-[18px]',
(!hasSetIndexType && indexType === IndexingType.QUALIFIED)
? 'border-text-accent-secondary text-text-accent-secondary'
: '',
)}
uppercase
>
{t('stepTwo.recommend', { ns: 'datasetCreation' })}
</Badge>
<span className="ml-auto">
{!hasSetIndexType && <span className={cn(s.radio)} />}
</span>
</div>
)}
description={t('stepTwo.qualifiedTip', { ns: 'datasetCreation' })}
icon={<Image src={indexMethodIcon.high_quality} alt="" />}
isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}
disabled={hasSetIndexType}
onSwitched={() => onIndexTypeChange(IndexingType.QUALIFIED)}
/>
)}
{/* Economical option */}
{(!hasSetIndexType || (hasSetIndexType && indexType === IndexingType.ECONOMICAL)) && (
<>
<CustomDialog show={isQAConfirmDialogOpen} onClose={onQAConfirmDialogClose} className="w-[432px]">
<header className="mb-4 pt-6">
<h2 className="text-lg font-semibold text-text-primary">
{t('stepTwo.qaSwitchHighQualityTipTitle', { ns: 'datasetCreation' })}
</h2>
<p className="mt-2 text-sm font-normal text-text-secondary">
{t('stepTwo.qaSwitchHighQualityTipContent', { ns: 'datasetCreation' })}
</p>
</header>
<div className="flex gap-2 pb-6">
<Button className="ml-auto" onClick={onQAConfirmDialogClose}>
{t('stepTwo.cancel', { ns: 'datasetCreation' })}
</Button>
<Button variant="primary" onClick={onQAConfirmDialogConfirm}>
{t('stepTwo.switch', { ns: 'datasetCreation' })}
</Button>
</div>
</CustomDialog>
<Tooltip
popupContent={(
<div className="rounded-lg border-components-panel-border bg-components-tooltip-bg p-3 text-xs font-medium text-text-secondary shadow-lg">
{docForm === ChunkingMode.qa
? t('stepTwo.notAvailableForQA', { ns: 'datasetCreation' })
: t('stepTwo.notAvailableForParentChild', { ns: 'datasetCreation' })}
</div>
)}
noDecoration
position="top"
asChild={false}
triggerClassName="flex-1 self-stretch"
>
<OptionCard
className="h-full"
title={t('stepTwo.economical', { ns: 'datasetCreation' })}
description={t('stepTwo.economicalTip', { ns: 'datasetCreation' })}
icon={<Image src={indexMethodIcon.economical} alt="" />}
isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}
disabled={hasSetIndexType || docForm !== ChunkingMode.text}
onSwitched={() => onIndexTypeChange(IndexingType.ECONOMICAL)}
/>
</Tooltip>
</>
)}
</div>
{/* High quality tip */}
{!hasSetIndexType && indexType === IndexingType.QUALIFIED && (
<div className="mt-2 flex h-10 items-center gap-x-0.5 overflow-hidden rounded-xl border-[0.5px] border-components-panel-border bg-components-panel-bg-blur p-2 shadow-xs backdrop-blur-[5px]">
<div className="absolute bottom-0 left-0 right-0 top-0 bg-dataset-warning-message-bg opacity-40"></div>
<div className="p-1">
<AlertTriangle className="size-4 text-text-warning-secondary" />
</div>
<span className="system-xs-medium text-text-primary">
{t('stepTwo.highQualityTip', { ns: 'datasetCreation' })}
</span>
</div>
)}
{/* Economical index setting tip */}
{hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
<div className="system-xs-medium mt-2 text-text-tertiary">
{t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })}
<Link className="text-text-accent" href={`/datasets/${datasetId}/settings`}>
{t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })}
</Link>
</div>
)}
{/* Embedding model */}
{indexType === IndexingType.QUALIFIED && (
<div className="mt-5">
<div className={cn('system-md-semibold mb-1 text-text-secondary', datasetId && 'flex items-center justify-between')}>
{t('form.embeddingModel', { ns: 'datasetSettings' })}
</div>
<ModelSelector
readonly={isModelAndRetrievalConfigDisabled}
triggerClassName={isModelAndRetrievalConfigDisabled ? 'opacity-50' : ''}
defaultModel={embeddingModel}
modelList={embeddingModelList ?? []}
onSelect={onEmbeddingModelChange}
/>
{isModelAndRetrievalConfigDisabled && (
<div className="system-xs-medium mt-2 text-text-tertiary">
{t('stepTwo.indexSettingTip', { ns: 'datasetCreation' })}
<Link className="text-text-accent" href={`/datasets/${datasetId}/settings`}>
{t('stepTwo.datasetSettingLink', { ns: 'datasetCreation' })}
</Link>
</div>
)}
</div>
)}
<Divider className="my-5" />
{/* Retrieval Method Config */}
<div>
{!isModelAndRetrievalConfigDisabled
? (
<div className="mb-1">
<div className="system-md-semibold mb-0.5 text-text-secondary">
{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
</div>
<div className="body-xs-regular text-text-tertiary">
<a
target="_blank"
rel="noopener noreferrer"
href={docLink('/guides/knowledge-base/create-knowledge-and-upload-documents')}
className="text-text-accent"
>
{t('form.retrievalSetting.learnMore', { ns: 'datasetSettings' })}
</a>
{t('form.retrievalSetting.longDescription', { ns: 'datasetSettings' })}
</div>
</div>
)
: (
<div className={cn('system-md-semibold mb-0.5 text-text-secondary', 'flex items-center justify-between')}>
<div>{t('form.retrievalSetting.title', { ns: 'datasetSettings' })}</div>
</div>
)}
<div>
{getIndexingTechnique() === IndexingType.QUALIFIED
? (
<RetrievalMethodConfig
disabled={isModelAndRetrievalConfigDisabled}
value={retrievalConfig}
onChange={onRetrievalConfigChange}
showMultiModalTip={showMultiModalTip}
/>
)
: (
<EconomicalRetrievalMethodConfig
disabled={isModelAndRetrievalConfigDisabled}
value={retrievalConfig}
onChange={onRetrievalConfigChange}
/>
)}
</div>
</div>
</>
)
}

View File

@@ -0,0 +1,191 @@
'use client'
import type { FC } from 'react'
import type { ParentChildConfig } from '../hooks'
import type { ParentMode, PreProcessingRule } from '@/models/datasets'
import { RiSearchEyeLine } from '@remixicon/react'
import Image from 'next/image'
import { useTranslation } from 'react-i18next'
import Button from '@/app/components/base/button'
import Checkbox from '@/app/components/base/checkbox'
import Divider from '@/app/components/base/divider'
import { ParentChildChunk } from '@/app/components/base/icons/src/vender/knowledge'
import RadioCard from '@/app/components/base/radio-card'
import { ChunkingMode } from '@/models/datasets'
import FileList from '../../assets/file-list-3-fill.svg'
import Note from '../../assets/note-mod.svg'
import BlueEffect from '../../assets/option-card-effect-blue.svg'
import s from '../index.module.css'
import { DelimiterInput, MaxLengthInput } from './inputs'
import { OptionCard } from './option-card'
type TextLabelProps = {
children: React.ReactNode
}
const TextLabel: FC<TextLabelProps> = ({ children }) => {
return <label className="system-sm-semibold text-text-secondary">{children}</label>
}
type ParentChildOptionsProps = {
// State
parentChildConfig: ParentChildConfig
rules: PreProcessingRule[]
currentDocForm: ChunkingMode
// Flags
isActive: boolean
isInUpload: boolean
isNotUploadInEmptyDataset: boolean
// Actions
onDocFormChange: (form: ChunkingMode) => void
onChunkForContextChange: (mode: ParentMode) => void
onParentDelimiterChange: (value: string) => void
onParentMaxLengthChange: (value: number) => void
onChildDelimiterChange: (value: string) => void
onChildMaxLengthChange: (value: number) => void
onRuleToggle: (id: string) => void
onPreview: () => void
onReset: () => void
}
export const ParentChildOptions: FC<ParentChildOptionsProps> = ({
parentChildConfig,
rules,
currentDocForm: _currentDocForm,
isActive,
isInUpload,
isNotUploadInEmptyDataset,
onDocFormChange,
onChunkForContextChange,
onParentDelimiterChange,
onParentMaxLengthChange,
onChildDelimiterChange,
onChildMaxLengthChange,
onRuleToggle,
onPreview,
onReset,
}) => {
const { t } = useTranslation()
const getRuleName = (key: string): string => {
const ruleNameMap: Record<string, string> = {
remove_extra_spaces: t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' }),
remove_urls_emails: t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' }),
remove_stopwords: t('stepTwo.removeStopwords', { ns: 'datasetCreation' }),
}
return ruleNameMap[key] ?? key
}
return (
<OptionCard
title={t('stepTwo.parentChild', { ns: 'datasetCreation' })}
icon={<ParentChildChunk className="h-[20px] w-[20px]" />}
effectImg={BlueEffect.src}
className="text-util-colors-blue-light-blue-light-500"
activeHeaderClassName="bg-dataset-option-card-blue-gradient"
description={t('stepTwo.parentChildTip', { ns: 'datasetCreation' })}
isActive={isActive}
onSwitched={() => onDocFormChange(ChunkingMode.parentChild)}
actions={(
<>
<Button variant="secondary-accent" onClick={onPreview}>
<RiSearchEyeLine className="mr-0.5 h-4 w-4" />
{t('stepTwo.previewChunk', { ns: 'datasetCreation' })}
</Button>
<Button variant="ghost" onClick={onReset}>
{t('stepTwo.reset', { ns: 'datasetCreation' })}
</Button>
</>
)}
noHighlight={isInUpload && isNotUploadInEmptyDataset}
>
<div className="flex flex-col gap-4">
{/* Parent chunk for context */}
<div>
<div className="flex items-center gap-x-2">
<div className="inline-flex shrink-0">
<TextLabel>{t('stepTwo.parentChunkForContext', { ns: 'datasetCreation' })}</TextLabel>
</div>
<Divider className="grow" bgStyle="gradient" />
</div>
<RadioCard
className="mt-1"
icon={<Image src={Note} alt="" />}
title={t('stepTwo.paragraph', { ns: 'datasetCreation' })}
description={t('stepTwo.paragraphTip', { ns: 'datasetCreation' })}
isChosen={parentChildConfig.chunkForContext === 'paragraph'}
onChosen={() => onChunkForContextChange('paragraph')}
chosenConfig={(
<div className="flex gap-3">
<DelimiterInput
value={parentChildConfig.parent.delimiter}
tooltip={t('stepTwo.parentChildDelimiterTip', { ns: 'datasetCreation' })!}
onChange={e => onParentDelimiterChange(e.target.value)}
/>
<MaxLengthInput
unit="characters"
value={parentChildConfig.parent.maxLength}
onChange={onParentMaxLengthChange}
/>
</div>
)}
/>
<RadioCard
className="mt-2"
icon={<Image src={FileList} alt="" />}
title={t('stepTwo.fullDoc', { ns: 'datasetCreation' })}
description={t('stepTwo.fullDocTip', { ns: 'datasetCreation' })}
onChosen={() => onChunkForContextChange('full-doc')}
isChosen={parentChildConfig.chunkForContext === 'full-doc'}
/>
</div>
{/* Child chunk for retrieval */}
<div>
<div className="flex items-center gap-x-2">
<div className="inline-flex shrink-0">
<TextLabel>{t('stepTwo.childChunkForRetrieval', { ns: 'datasetCreation' })}</TextLabel>
</div>
<Divider className="grow" bgStyle="gradient" />
</div>
<div className="mt-1 flex gap-3">
<DelimiterInput
value={parentChildConfig.child.delimiter}
tooltip={t('stepTwo.parentChildChunkDelimiterTip', { ns: 'datasetCreation' })!}
onChange={e => onChildDelimiterChange(e.target.value)}
/>
<MaxLengthInput
unit="characters"
value={parentChildConfig.child.maxLength}
onChange={onChildMaxLengthChange}
/>
</div>
</div>
{/* Rules */}
<div>
<div className="flex items-center gap-x-2">
<div className="inline-flex shrink-0">
<TextLabel>{t('stepTwo.rules', { ns: 'datasetCreation' })}</TextLabel>
</div>
<Divider className="grow" bgStyle="gradient" />
</div>
<div className="mt-1">
{rules.map(rule => (
<div
key={rule.id}
className={s.ruleItem}
onClick={() => onRuleToggle(rule.id)}
>
<Checkbox checked={rule.enabled} />
<label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
{getRuleName(rule.id)}
</label>
</div>
))}
</div>
</div>
</div>
</OptionCard>
)
}

View File

@@ -0,0 +1,171 @@
'use client'
import type { FC } from 'react'
import type { ParentChildConfig } from '../hooks'
import type { DataSourceType, FileIndexingEstimateResponse } from '@/models/datasets'
import { RiSearchEyeLine } from '@remixicon/react'
import { noop } from 'es-toolkit/function'
import { useTranslation } from 'react-i18next'
import Badge from '@/app/components/base/badge'
import FloatRightContainer from '@/app/components/base/float-right-container'
import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
import { FULL_DOC_PREVIEW_LENGTH } from '@/config'
import { ChunkingMode } from '@/models/datasets'
import { cn } from '@/utils/classnames'
import { ChunkContainer, QAPreview } from '../../../chunk'
import PreviewDocumentPicker from '../../../common/document-picker/preview-document-picker'
import { PreviewSlice } from '../../../formatted-text/flavours/preview-slice'
import { FormattedText } from '../../../formatted-text/formatted'
import PreviewContainer from '../../../preview/container'
import { PreviewHeader } from '../../../preview/header'
type PreviewPanelProps = {
// State
isMobile: boolean
dataSourceType: DataSourceType
currentDocForm: ChunkingMode
estimate?: FileIndexingEstimateResponse
parentChildConfig: ParentChildConfig
isSetting?: boolean
// Picker
pickerFiles: Array<{ id: string, name: string, extension: string }>
pickerValue: { id: string, name: string, extension: string }
// Mutation state
isIdle: boolean
isPending: boolean
// Actions
onPickerChange: (selected: { id: string, name: string }) => void
}
export const PreviewPanel: FC<PreviewPanelProps> = ({
isMobile,
dataSourceType: _dataSourceType,
currentDocForm,
estimate,
parentChildConfig,
isSetting,
pickerFiles,
pickerValue,
isIdle,
isPending,
onPickerChange,
}) => {
const { t } = useTranslation()
return (
<FloatRightContainer isMobile={isMobile} isOpen={true} onClose={noop} footer={null}>
<PreviewContainer
header={(
<PreviewHeader title={t('stepTwo.preview', { ns: 'datasetCreation' })}>
<div className="flex items-center gap-1">
<PreviewDocumentPicker
files={pickerFiles as Array<Required<{ id: string, name: string, extension: string }>>}
onChange={onPickerChange}
value={isSetting ? pickerFiles[0] : pickerValue}
/>
{currentDocForm !== ChunkingMode.qa && (
<Badge
text={t('stepTwo.previewChunkCount', {
ns: 'datasetCreation',
count: estimate?.total_segments || 0,
}) as string}
/>
)}
</div>
</PreviewHeader>
)}
className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')}
mainClassName="space-y-6"
>
{/* QA Preview */}
{currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (
estimate.qa_preview.map((item, index) => (
<ChunkContainer
key={item.question}
label={`Chunk-${index + 1}`}
characterCount={item.question.length + item.answer.length}
>
<QAPreview qa={item} />
</ChunkContainer>
))
)}
{/* Text Preview */}
{currentDocForm === ChunkingMode.text && estimate?.preview && (
estimate.preview.map((item, index) => (
<ChunkContainer
key={item.content}
label={`Chunk-${index + 1}`}
characterCount={item.content.length}
>
{item.content}
</ChunkContainer>
))
)}
{/* Parent-Child Preview */}
{currentDocForm === ChunkingMode.parentChild && estimate?.preview && (
estimate.preview.map((item, index) => {
const indexForLabel = index + 1
const childChunks = parentChildConfig.chunkForContext === 'full-doc'
? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
: item.child_chunks
return (
<ChunkContainer
key={item.content}
label={`Chunk-${indexForLabel}`}
characterCount={item.content.length}
>
<FormattedText>
{childChunks.map((child, childIndex) => {
const childIndexForLabel = childIndex + 1
return (
<PreviewSlice
key={`C-${childIndexForLabel}-${child}`}
label={`C-${childIndexForLabel}`}
text={child}
tooltip={`Child-chunk-${childIndexForLabel} · ${child.length} Characters`}
labelInnerClassName="text-[10px] font-semibold align-bottom leading-7"
dividerClassName="leading-7"
/>
)
})}
</FormattedText>
</ChunkContainer>
)
})
)}
{/* Idle State */}
{isIdle && (
<div className="flex h-full w-full items-center justify-center">
<div className="flex flex-col items-center justify-center gap-3">
<RiSearchEyeLine className="size-10 text-text-empty-state-icon" />
<p className="text-sm text-text-tertiary">
{t('stepTwo.previewChunkTip', { ns: 'datasetCreation' })}
</p>
</div>
</div>
)}
{/* Loading State */}
{isPending && (
<div className="space-y-6">
{Array.from({ length: 10 }, (_, i) => (
<SkeletonContainer key={i}>
<SkeletonRow>
<SkeletonRectangle className="w-20" />
<SkeletonPoint />
<SkeletonRectangle className="w-24" />
</SkeletonRow>
<SkeletonRectangle className="w-full" />
<SkeletonRectangle className="w-full" />
<SkeletonRectangle className="w-[422px]" />
</SkeletonContainer>
))}
</div>
)}
</PreviewContainer>
</FloatRightContainer>
)
}

View File

@@ -0,0 +1,58 @@
'use client'
import type { FC } from 'react'
import { RiArrowLeftLine } from '@remixicon/react'
import { useTranslation } from 'react-i18next'
import Button from '@/app/components/base/button'
type StepTwoFooterProps = {
isSetting?: boolean
isCreating: boolean
onPrevious: () => void
onCreate: () => void
onCancel?: () => void
}
export const StepTwoFooter: FC<StepTwoFooterProps> = ({
isSetting,
isCreating,
onPrevious,
onCreate,
onCancel,
}) => {
const { t } = useTranslation()
if (!isSetting) {
return (
<div className="mt-8 flex items-center py-2">
<Button onClick={onPrevious}>
<RiArrowLeftLine className="mr-1 h-4 w-4" />
{t('stepTwo.previousStep', { ns: 'datasetCreation' })}
</Button>
<Button
className="ml-auto"
loading={isCreating}
variant="primary"
onClick={onCreate}
>
{t('stepTwo.nextStep', { ns: 'datasetCreation' })}
</Button>
</div>
)
}
return (
<div className="mt-8 flex items-center py-2">
<Button
loading={isCreating}
variant="primary"
onClick={onCreate}
>
{t('stepTwo.save', { ns: 'datasetCreation' })}
</Button>
<Button className="ml-2" onClick={onCancel}>
{t('stepTwo.cancel', { ns: 'datasetCreation' })}
</Button>
</div>
)
}

View File

@@ -0,0 +1,14 @@
export { useDocumentCreation } from './use-document-creation'
export type { DocumentCreation, ValidationParams } from './use-document-creation'
export { IndexingType, useIndexingConfig } from './use-indexing-config'
export type { IndexingConfig } from './use-indexing-config'
export { useIndexingEstimate } from './use-indexing-estimate'
export type { IndexingEstimate } from './use-indexing-estimate'
export { usePreviewState } from './use-preview-state'
export type { PreviewState } from './use-preview-state'
export { DEFAULT_MAXIMUM_CHUNK_LENGTH, DEFAULT_OVERLAP, DEFAULT_SEGMENT_IDENTIFIER, defaultParentChildConfig, MAXIMUM_CHUNK_TOKEN_LENGTH, useSegmentationState } from './use-segmentation-state'
export type { ParentChildConfig, SegmentationState } from './use-segmentation-state'

View File

@@ -0,0 +1,279 @@
import type { DefaultModel, Model } from '@/app/components/header/account-setting/model-provider-page/declarations'
import type { NotionPage } from '@/models/common'
import type {
ChunkingMode,
CrawlOptions,
CrawlResultItem,
CreateDocumentReq,
createDocumentResponse,
CustomFile,
FullDocumentDetail,
ProcessRule,
} from '@/models/datasets'
import type { RetrievalConfig, RETRIEVE_METHOD } from '@/types/app'
import { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import { trackEvent } from '@/app/components/base/amplitude'
import Toast from '@/app/components/base/toast'
import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import { DataSourceProvider } from '@/models/common'
import {
DataSourceType,
} from '@/models/datasets'
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument } from '@/service/knowledge/use-create-dataset'
import { useInvalidDatasetList } from '@/service/knowledge/use-dataset'
import { IndexingType } from './use-indexing-config'
import { MAXIMUM_CHUNK_TOKEN_LENGTH } from './use-segmentation-state'
export type UseDocumentCreationOptions = {
datasetId?: string
isSetting?: boolean
documentDetail?: FullDocumentDetail
dataSourceType: DataSourceType
files: CustomFile[]
notionPages: NotionPage[]
notionCredentialId: string
websitePages: CrawlResultItem[]
crawlOptions?: CrawlOptions
websiteCrawlProvider?: DataSourceProvider
websiteCrawlJobId?: string
// Callbacks
onStepChange?: (delta: number) => void
updateIndexingTypeCache?: (type: string) => void
updateResultCache?: (res: createDocumentResponse) => void
updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void
onSave?: () => void
mutateDatasetRes?: () => void
}
export type ValidationParams = {
segmentationType: string
maxChunkLength: number
limitMaxChunkLength: number
overlap: number
indexType: IndexingType
embeddingModel: DefaultModel
rerankModelList: Model[]
retrievalConfig: RetrievalConfig
}
export const useDocumentCreation = (options: UseDocumentCreationOptions) => {
const { t } = useTranslation()
const {
datasetId,
isSetting,
documentDetail,
dataSourceType,
files,
notionPages,
notionCredentialId,
websitePages,
crawlOptions,
websiteCrawlProvider = DataSourceProvider.jinaReader,
websiteCrawlJobId = '',
onStepChange,
updateIndexingTypeCache,
updateResultCache,
updateRetrievalMethodCache,
onSave,
mutateDatasetRes,
} = options
const createFirstDocumentMutation = useCreateFirstDocument()
const createDocumentMutation = useCreateDocument(datasetId!)
const invalidDatasetList = useInvalidDatasetList()
const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
// Validate creation params
const validateParams = useCallback((params: ValidationParams): boolean => {
const {
segmentationType,
maxChunkLength,
limitMaxChunkLength,
overlap,
indexType,
embeddingModel,
rerankModelList,
retrievalConfig,
} = params
if (segmentationType === 'general' && overlap > maxChunkLength) {
Toast.notify({ type: 'error', message: t('stepTwo.overlapCheck', { ns: 'datasetCreation' }) })
return false
}
if (segmentationType === 'general' && maxChunkLength > limitMaxChunkLength) {
Toast.notify({
type: 'error',
message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: limitMaxChunkLength }),
})
return false
}
if (!isSetting) {
if (indexType === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
Toast.notify({
type: 'error',
message: t('datasetConfig.embeddingModelRequired', { ns: 'appDebug' }),
})
return false
}
if (!isReRankModelSelected({
rerankModelList,
retrievalConfig,
indexMethod: indexType,
})) {
Toast.notify({ type: 'error', message: t('datasetConfig.rerankModelRequired', { ns: 'appDebug' }) })
return false
}
}
return true
}, [t, isSetting])
// Build creation params
const buildCreationParams = useCallback((
currentDocForm: ChunkingMode,
docLanguage: string,
processRule: ProcessRule,
retrievalConfig: RetrievalConfig,
embeddingModel: DefaultModel,
indexingTechnique: string,
): CreateDocumentReq | null => {
if (isSetting) {
return {
original_document_id: documentDetail?.id,
doc_form: currentDocForm,
doc_language: docLanguage,
process_rule: processRule,
retrieval_model: retrievalConfig,
embedding_model: embeddingModel.model,
embedding_model_provider: embeddingModel.provider,
indexing_technique: indexingTechnique,
} as CreateDocumentReq
}
const params: CreateDocumentReq = {
data_source: {
type: dataSourceType,
info_list: {
data_source_type: dataSourceType,
},
},
indexing_technique: indexingTechnique,
process_rule: processRule,
doc_form: currentDocForm,
doc_language: docLanguage,
retrieval_model: retrievalConfig,
embedding_model: embeddingModel.model,
embedding_model_provider: embeddingModel.provider,
} as CreateDocumentReq
// Add data source specific info
if (dataSourceType === DataSourceType.FILE) {
params.data_source!.info_list.file_info_list = {
file_ids: files.map(file => file.id || '').filter(Boolean),
}
}
if (dataSourceType === DataSourceType.NOTION)
params.data_source!.info_list.notion_info_list = getNotionInfo(notionPages, notionCredentialId)
if (dataSourceType === DataSourceType.WEB) {
params.data_source!.info_list.website_info_list = getWebsiteInfo({
websiteCrawlProvider,
websiteCrawlJobId,
websitePages,
crawlOptions,
})
}
return params
}, [
isSetting,
documentDetail,
dataSourceType,
files,
notionPages,
notionCredentialId,
websitePages,
websiteCrawlProvider,
websiteCrawlJobId,
crawlOptions,
])
// Execute creation
const executeCreation = useCallback(async (
params: CreateDocumentReq,
indexType: IndexingType,
retrievalConfig: RetrievalConfig,
) => {
if (!datasetId) {
await createFirstDocumentMutation.mutateAsync(params, {
onSuccess(data) {
updateIndexingTypeCache?.(indexType)
updateResultCache?.(data)
updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD)
},
})
}
else {
await createDocumentMutation.mutateAsync(params, {
onSuccess(data) {
updateIndexingTypeCache?.(indexType)
updateResultCache?.(data)
updateRetrievalMethodCache?.(retrievalConfig.search_method as RETRIEVE_METHOD)
},
})
}
mutateDatasetRes?.()
invalidDatasetList()
trackEvent('create_datasets', {
data_source_type: dataSourceType,
indexing_technique: indexType,
})
onStepChange?.(+1)
if (isSetting)
onSave?.()
}, [
datasetId,
createFirstDocumentMutation,
createDocumentMutation,
updateIndexingTypeCache,
updateResultCache,
updateRetrievalMethodCache,
mutateDatasetRes,
invalidDatasetList,
dataSourceType,
onStepChange,
isSetting,
onSave,
])
// Validate preview params
const validatePreviewParams = useCallback((maxChunkLength: number): boolean => {
if (maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
Toast.notify({
type: 'error',
message: t('stepTwo.maxLengthCheck', { ns: 'datasetCreation', limit: MAXIMUM_CHUNK_TOKEN_LENGTH }),
})
return false
}
return true
}, [t])
return {
isCreating,
validateParams,
buildCreationParams,
executeCreation,
validatePreviewParams,
}
}
export type DocumentCreation = ReturnType<typeof useDocumentCreation>

View File

@@ -0,0 +1,143 @@
import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
import type { RetrievalConfig } from '@/types/app'
import { useEffect, useMemo, useState } from 'react'
import { checkShowMultiModalTip } from '@/app/components/datasets/settings/utils'
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
import { RETRIEVE_METHOD } from '@/types/app'
export enum IndexingType {
QUALIFIED = 'high_quality',
ECONOMICAL = 'economy',
}
const DEFAULT_RETRIEVAL_CONFIG: RetrievalConfig = {
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: false,
reranking_model: {
reranking_provider_name: '',
reranking_model_name: '',
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
}
export type UseIndexingConfigOptions = {
initialIndexType?: IndexingType
initialEmbeddingModel?: DefaultModel
initialRetrievalConfig?: RetrievalConfig
isAPIKeySet: boolean
hasSetIndexType: boolean
}
export const useIndexingConfig = (options: UseIndexingConfigOptions) => {
const {
initialIndexType,
initialEmbeddingModel,
initialRetrievalConfig,
isAPIKeySet,
hasSetIndexType,
} = options
// Rerank model
const {
modelList: rerankModelList,
defaultModel: rerankDefaultModel,
currentModel: isRerankDefaultModelValid,
} = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
// Embedding model list
const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
// Index type state
const [indexType, setIndexType] = useState<IndexingType>(() => {
if (initialIndexType)
return initialIndexType
return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
})
// Embedding model state
const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
initialEmbeddingModel ?? {
provider: defaultEmbeddingModel?.provider.provider || '',
model: defaultEmbeddingModel?.model || '',
},
)
// Retrieval config state
const [retrievalConfig, setRetrievalConfig] = useState<RetrievalConfig>(
initialRetrievalConfig ?? DEFAULT_RETRIEVAL_CONFIG,
)
// Sync retrieval config with rerank model when available
useEffect(() => {
if (initialRetrievalConfig)
return
setRetrievalConfig({
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: !!isRerankDefaultModelValid,
reranking_model: {
reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
},
top_k: 3,
score_threshold_enabled: false,
score_threshold: 0.5,
})
}, [rerankDefaultModel, isRerankDefaultModelValid, initialRetrievalConfig])
// Sync index type with props
useEffect(() => {
if (initialIndexType)
setIndexType(initialIndexType)
else
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
}, [isAPIKeySet, initialIndexType])
// Show multimodal tip
const showMultiModalTip = useMemo(() => {
return checkShowMultiModalTip({
embeddingModel,
rerankingEnable: retrievalConfig.reranking_enable,
rerankModel: {
rerankingProviderName: retrievalConfig.reranking_model.reranking_provider_name,
rerankingModelName: retrievalConfig.reranking_model.reranking_model_name,
},
indexMethod: indexType,
embeddingModelList,
rerankModelList,
})
}, [embeddingModel, retrievalConfig, indexType, embeddingModelList, rerankModelList])
// Get effective indexing technique
const getIndexingTechnique = () => initialIndexType || indexType
return {
// Index type
indexType,
setIndexType,
hasSetIndexType,
getIndexingTechnique,
// Embedding model
embeddingModel,
setEmbeddingModel,
embeddingModelList,
defaultEmbeddingModel,
// Retrieval config
retrievalConfig,
setRetrievalConfig,
rerankModelList,
rerankDefaultModel,
isRerankDefaultModelValid,
// Computed
showMultiModalTip,
}
}
export type IndexingConfig = ReturnType<typeof useIndexingConfig>

View File

@@ -0,0 +1,123 @@
import type { IndexingType } from './use-indexing-config'
import type { NotionPage } from '@/models/common'
import type { ChunkingMode, CrawlOptions, CrawlResultItem, CustomFile, ProcessRule } from '@/models/datasets'
import { useCallback } from 'react'
import { DataSourceProvider } from '@/models/common'
import { DataSourceType } from '@/models/datasets'
import {
useFetchFileIndexingEstimateForFile,
useFetchFileIndexingEstimateForNotion,
useFetchFileIndexingEstimateForWeb,
} from '@/service/knowledge/use-create-dataset'
export type UseIndexingEstimateOptions = {
dataSourceType: DataSourceType
datasetId?: string
// Document settings
currentDocForm: ChunkingMode
docLanguage: string
// File data source
files: CustomFile[]
previewFileName?: string
// Notion data source
previewNotionPage: NotionPage
notionCredentialId: string
// Website data source
previewWebsitePage: CrawlResultItem
crawlOptions?: CrawlOptions
websiteCrawlProvider?: DataSourceProvider
websiteCrawlJobId?: string
// Processing
indexingTechnique: IndexingType
processRule: ProcessRule
}
export const useIndexingEstimate = (options: UseIndexingEstimateOptions) => {
const {
dataSourceType,
datasetId,
currentDocForm,
docLanguage,
files,
previewFileName,
previewNotionPage,
notionCredentialId,
previewWebsitePage,
crawlOptions,
websiteCrawlProvider,
websiteCrawlJobId,
indexingTechnique,
processRule,
} = options
// File indexing estimate
const fileQuery = useFetchFileIndexingEstimateForFile({
docForm: currentDocForm,
docLanguage,
dataSourceType: DataSourceType.FILE,
files: previewFileName
? [files.find(file => file.name === previewFileName)!]
: files,
indexingTechnique,
processRule,
dataset_id: datasetId!,
})
// Notion indexing estimate
const notionQuery = useFetchFileIndexingEstimateForNotion({
docForm: currentDocForm,
docLanguage,
dataSourceType: DataSourceType.NOTION,
notionPages: [previewNotionPage],
indexingTechnique,
processRule,
dataset_id: datasetId || '',
credential_id: notionCredentialId,
})
// Website indexing estimate
const websiteQuery = useFetchFileIndexingEstimateForWeb({
docForm: currentDocForm,
docLanguage,
dataSourceType: DataSourceType.WEB,
websitePages: [previewWebsitePage],
crawlOptions,
websiteCrawlProvider: websiteCrawlProvider ?? DataSourceProvider.jinaReader,
websiteCrawlJobId: websiteCrawlJobId ?? '',
indexingTechnique,
processRule,
dataset_id: datasetId || '',
})
// Get current mutation based on data source type
const getCurrentMutation = useCallback(() => {
if (dataSourceType === DataSourceType.FILE)
return fileQuery
if (dataSourceType === DataSourceType.NOTION)
return notionQuery
return websiteQuery
}, [dataSourceType, fileQuery, notionQuery, websiteQuery])
const currentMutation = getCurrentMutation()
// Trigger estimate fetch
const fetchEstimate = useCallback(() => {
if (dataSourceType === DataSourceType.FILE)
fileQuery.mutate()
else if (dataSourceType === DataSourceType.NOTION)
notionQuery.mutate()
else
websiteQuery.mutate()
}, [dataSourceType, fileQuery, notionQuery, websiteQuery])
return {
currentMutation,
estimate: currentMutation.data,
isIdle: currentMutation.isIdle,
isPending: currentMutation.isPending,
fetchEstimate,
reset: currentMutation.reset,
}
}
export type IndexingEstimate = ReturnType<typeof useIndexingEstimate>

View File

@@ -0,0 +1,127 @@
import type { NotionPage } from '@/models/common'
import type { CrawlResultItem, CustomFile, DocumentItem, FullDocumentDetail } from '@/models/datasets'
import { useCallback, useState } from 'react'
import { DataSourceType } from '@/models/datasets'
export type UsePreviewStateOptions = {
dataSourceType: DataSourceType
files: CustomFile[]
notionPages: NotionPage[]
websitePages: CrawlResultItem[]
documentDetail?: FullDocumentDetail
datasetId?: string
}
export const usePreviewState = (options: UsePreviewStateOptions) => {
const {
dataSourceType,
files,
notionPages,
websitePages,
documentDetail,
datasetId,
} = options
// File preview state
const [previewFile, setPreviewFile] = useState<DocumentItem>(
(datasetId && documentDetail)
? documentDetail.file
: files[0],
)
// Notion page preview state
const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(
(datasetId && documentDetail)
? documentDetail.notion_page
: notionPages[0],
)
// Website page preview state
const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(
(datasetId && documentDetail)
? documentDetail.website_page
: websitePages[0],
)
// Get preview items for document picker based on data source type
const getPreviewPickerItems = useCallback(() => {
if (dataSourceType === DataSourceType.FILE) {
return files as Array<Required<CustomFile>>
}
if (dataSourceType === DataSourceType.NOTION) {
return notionPages.map(page => ({
id: page.page_id,
name: page.page_name,
extension: 'md',
}))
}
if (dataSourceType === DataSourceType.WEB) {
return websitePages.map(page => ({
id: page.source_url,
name: page.title,
extension: 'md',
}))
}
return []
}, [dataSourceType, files, notionPages, websitePages])
// Get current preview value for picker
const getPreviewPickerValue = useCallback(() => {
if (dataSourceType === DataSourceType.FILE) {
return previewFile as Required<CustomFile>
}
if (dataSourceType === DataSourceType.NOTION) {
return {
id: previewNotionPage?.page_id || '',
name: previewNotionPage?.page_name || '',
extension: 'md',
}
}
if (dataSourceType === DataSourceType.WEB) {
return {
id: previewWebsitePage?.source_url || '',
name: previewWebsitePage?.title || '',
extension: 'md',
}
}
return { id: '', name: '', extension: '' }
}, [dataSourceType, previewFile, previewNotionPage, previewWebsitePage])
// Handle preview change
const handlePreviewChange = useCallback((selected: { id: string, name: string }) => {
if (dataSourceType === DataSourceType.FILE) {
setPreviewFile(selected as DocumentItem)
}
else if (dataSourceType === DataSourceType.NOTION) {
const selectedPage = notionPages.find(page => page.page_id === selected.id)
if (selectedPage)
setPreviewNotionPage(selectedPage)
}
else if (dataSourceType === DataSourceType.WEB) {
const selectedPage = websitePages.find(page => page.source_url === selected.id)
if (selectedPage)
setPreviewWebsitePage(selectedPage)
}
}, [dataSourceType, notionPages, websitePages])
return {
// File preview
previewFile,
setPreviewFile,
// Notion preview
previewNotionPage,
setPreviewNotionPage,
// Website preview
previewWebsitePage,
setPreviewWebsitePage,
// Picker helpers
getPreviewPickerItems,
getPreviewPickerValue,
handlePreviewChange,
}
}
export type PreviewState = ReturnType<typeof usePreviewState>

View File

@@ -0,0 +1,222 @@
import type { ParentMode, PreProcessingRule, ProcessRule, Rules } from '@/models/datasets'
import { useCallback, useState } from 'react'
import { ChunkingMode, ProcessMode } from '@/models/datasets'
import escape from './escape'
import unescape from './unescape'
// Constants
export const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
export const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
export const DEFAULT_OVERLAP = 50
export const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(
globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000',
10,
)
export type ParentChildConfig = {
chunkForContext: ParentMode
parent: {
delimiter: string
maxLength: number
}
child: {
delimiter: string
maxLength: number
}
}
export const defaultParentChildConfig: ParentChildConfig = {
chunkForContext: 'paragraph',
parent: {
delimiter: '\\n\\n',
maxLength: 1024,
},
child: {
delimiter: '\\n',
maxLength: 512,
},
}
export type UseSegmentationStateOptions = {
initialSegmentationType?: ProcessMode
}
export const useSegmentationState = (options: UseSegmentationStateOptions = {}) => {
const { initialSegmentationType } = options
// Segmentation type (general or parent-child)
const [segmentationType, setSegmentationType] = useState<ProcessMode>(
initialSegmentationType ?? ProcessMode.general,
)
// General chunking settings
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH)
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
// Pre-processing rules
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
// Parent-child config
const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
// Escaped segment identifier setter
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
if (value) {
doSetSegmentIdentifier(escape(value))
}
else {
doSetSegmentIdentifier(canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)
}
}, [])
// Rule toggle handler
const toggleRule = useCallback((id: string) => {
setRules(prev => prev.map(rule =>
rule.id === id ? { ...rule, enabled: !rule.enabled } : rule,
))
}, [])
// Reset to defaults
const resetToDefaults = useCallback(() => {
if (defaultConfig) {
setSegmentIdentifier(defaultConfig.segmentation.separator)
setMaxChunkLength(defaultConfig.segmentation.max_tokens)
setOverlap(defaultConfig.segmentation.chunk_overlap!)
setRules(defaultConfig.pre_processing_rules)
}
setParentChildConfig(defaultParentChildConfig)
}, [defaultConfig, setSegmentIdentifier])
// Apply config from document detail
const applyConfigFromRules = useCallback((rulesConfig: Rules, isHierarchical: boolean) => {
const separator = rulesConfig.segmentation.separator
const max = rulesConfig.segmentation.max_tokens
const chunkOverlap = rulesConfig.segmentation.chunk_overlap
setSegmentIdentifier(separator)
setMaxChunkLength(max)
setOverlap(chunkOverlap!)
setRules(rulesConfig.pre_processing_rules)
setDefaultConfig(rulesConfig)
if (isHierarchical) {
setParentChildConfig({
chunkForContext: rulesConfig.parent_mode || 'paragraph',
parent: {
delimiter: escape(rulesConfig.segmentation.separator),
maxLength: rulesConfig.segmentation.max_tokens,
},
child: {
delimiter: escape(rulesConfig.subchunk_segmentation!.separator),
maxLength: rulesConfig.subchunk_segmentation!.max_tokens,
},
})
}
}, [setSegmentIdentifier])
// Get process rule for API
const getProcessRule = useCallback((docForm: ChunkingMode): ProcessRule => {
if (docForm === ChunkingMode.parentChild) {
return {
rules: {
pre_processing_rules: rules,
segmentation: {
separator: unescape(parentChildConfig.parent.delimiter),
max_tokens: parentChildConfig.parent.maxLength,
},
parent_mode: parentChildConfig.chunkForContext,
subchunk_segmentation: {
separator: unescape(parentChildConfig.child.delimiter),
max_tokens: parentChildConfig.child.maxLength,
},
},
mode: 'hierarchical',
} as ProcessRule
}
return {
rules: {
pre_processing_rules: rules,
segmentation: {
separator: unescape(segmentIdentifier),
max_tokens: maxChunkLength,
chunk_overlap: overlap,
},
},
mode: segmentationType,
} as ProcessRule
}, [rules, parentChildConfig, segmentIdentifier, maxChunkLength, overlap, segmentationType])
// Update parent config field
const updateParentConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
setParentChildConfig((prev) => {
let newValue: string | number
if (field === 'delimiter')
newValue = value ? escape(value as string) : ''
else
newValue = value
return {
...prev,
parent: { ...prev.parent, [field]: newValue },
}
})
}, [])
// Update child config field
const updateChildConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
setParentChildConfig((prev) => {
let newValue: string | number
if (field === 'delimiter')
newValue = value ? escape(value as string) : ''
else
newValue = value
return {
...prev,
child: { ...prev.child, [field]: newValue },
}
})
}, [])
// Set chunk for context mode
const setChunkForContext = useCallback((mode: ParentMode) => {
setParentChildConfig(prev => ({ ...prev, chunkForContext: mode }))
}, [])
return {
// General chunking state
segmentationType,
setSegmentationType,
segmentIdentifier,
setSegmentIdentifier,
maxChunkLength,
setMaxChunkLength,
limitMaxChunkLength,
setLimitMaxChunkLength,
overlap,
setOverlap,
// Rules
rules,
setRules,
defaultConfig,
setDefaultConfig,
toggleRule,
// Parent-child config
parentChildConfig,
setParentChildConfig,
updateParentConfig,
updateChildConfig,
setChunkForContext,
// Actions
resetToDefaults,
applyConfigFromRules,
getProcessRule,
}
}
export type SegmentationState = ReturnType<typeof useSegmentationState>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,28 @@
import type { IndexingType } from './hooks'
import type { DataSourceProvider, NotionPage } from '@/models/common'
import type { CrawlOptions, CrawlResultItem, createDocumentResponse, CustomFile, DataSourceType, FullDocumentDetail } from '@/models/datasets'
import type { RETRIEVE_METHOD } from '@/types/app'
export type StepTwoProps = {
isSetting?: boolean
documentDetail?: FullDocumentDetail
isAPIKeySet: boolean
onSetting: () => void
datasetId?: string
indexingType?: IndexingType
retrievalMethod?: string
dataSourceType: DataSourceType
files: CustomFile[]
notionPages?: NotionPage[]
notionCredentialId: string
websitePages?: CrawlResultItem[]
crawlOptions?: CrawlOptions
websiteCrawlProvider?: DataSourceProvider
websiteCrawlJobId?: string
onStepChange?: (delta: number) => void
updateIndexingTypeCache?: (type: string) => void
updateRetrievalMethodCache?: (method: RETRIEVE_METHOD | '') => void
updateResultCache?: (res: createDocumentResponse) => void
onSave?: () => void
onCancel?: () => void
}