From 997b9e36a6a8b99a060189a9000ff6ff227cc254 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 18 Nov 2025 16:38:35 +0800 Subject: [PATCH 1/7] multimodal embedding update segment --- api/configs/feature/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index 78e1471136..9c9fb1a7a2 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -360,13 +360,11 @@ class FileUploadConfig(BaseSettings): default=10, ) - IMAGE_FILE_BATCH_LIMIT: PositiveInt = Field( description="Maximum number of files allowed in a image batch upload operation", default=10, ) - SINGLE_CHUNK_ATTACHMENT_LIMIT: PositiveInt = Field( description="Maximum number of files allowed in a single chunk attachment", default=10, From 530120a1f1413e06737a42ed64713e063ccaab7d Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 21 Nov 2025 14:20:45 +0800 Subject: [PATCH 2/7] multimodal embedding update segment --- .../index_processor/index_processor_base.py | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index d01e13095d..48895b418d 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -1,8 +1,8 @@ """Abstract interface for document loader implementations.""" +import re from abc import ABC, abstractmethod from collections.abc import Mapping -import re from typing import TYPE_CHECKING, Any, Optional from configs import dify_config @@ -15,10 +15,10 @@ from core.rag.splitter.fixed_text_splitter import ( FixedRecursiveCharacterTextSplitter, ) from core.rag.splitter.text_splitter import TextSplitter +from extensions.ext_database import db from models.dataset import Dataset, DatasetProcessRule from models.dataset import Document as DatasetDocument from models.model import UploadFile -from extensions.ext_database import db if TYPE_CHECKING: from core.model_manager import ModelInstance @@ -114,19 +114,23 @@ class BaseIndexProcessor(ABC): """ multi_model_documents = [] text = document.page_content + + # Collect all upload_file_ids including duplicates to preserve occurrence count + upload_file_id_list = [] + + # For data before v0.10.0 pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?" matches = re.finditer(pattern, text) - upload_file_ids = [] for match in matches: upload_file_id = match.group(1) - upload_file_ids.append(upload_file_id) + upload_file_id_list.append(upload_file_id) # For data after v0.10.0 pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?" matches = re.finditer(pattern, text) for match in matches: upload_file_id = match.group(1) - upload_file_ids.append(upload_file_id) + upload_file_id_list.append(upload_file_id) # For tools directory - direct file formats (e.g., .png, .jpg, etc.) 
# Match URL including any query parameters up to common URL boundaries (space, parenthesis, quotes) @@ -134,10 +138,22 @@ class BaseIndexProcessor(ABC): matches = re.finditer(pattern, text) for match in matches: upload_file_id = match.group(1) - upload_file_ids.append(upload_file_id) - upload_files = db.session.query(UploadFile).filter(UploadFile.id.in_(upload_file_ids)).all() - if upload_files: - for upload_file in upload_files: + upload_file_id_list.append(upload_file_id) + + if not upload_file_id_list: + return multi_model_documents + + # Get unique IDs for database query + unique_upload_file_ids = list(set(upload_file_id_list)) + upload_files = db.session.query(UploadFile).filter(UploadFile.id.in_(unique_upload_file_ids)).all() + + # Create a mapping from ID to UploadFile for quick lookup + upload_file_map = {upload_file.id: upload_file for upload_file in upload_files} + + # Create a Document for each occurrence (including duplicates) + for upload_file_id in upload_file_id_list: + upload_file = upload_file_map.get(upload_file_id) + if upload_file: multi_model_documents.append(Document( page_content=upload_file.name, metadata={ From 5a6b8bba3b72c7af41edf2f8cb11fd8468bf5f99 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 21 Nov 2025 14:29:35 +0800 Subject: [PATCH 3/7] multimodal embedding update segment --- api/core/tools/utils/text_processing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py index 105823f896..80c69e94c8 100644 --- a/api/core/tools/utils/text_processing_utils.py +++ b/api/core/tools/utils/text_processing_utils.py @@ -13,5 +13,5 @@ def remove_leading_symbols(text: str) -> str: """ # Match Unicode ranges for punctuation and symbols # FIXME this pattern is confused quick fix for #11868 maybe refactor it later - pattern = r"^[\u2000-\u206F\u2E00-\u2E7F\u3000-\u303F!\"#$%&'()*+,./:;<=>?@^_`~]+" + pattern = r"^[\u2000-\u206F\u2E00-\u2E7F\u3000-\u303F\"#$%&'()*+,./:;<=>?@^_`~]+" return re.sub(pattern, "", text) From f41c6e81403bb81acad316c290905c6a62c494a5 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 25 Nov 2025 17:22:48 +0800 Subject: [PATCH 4/7] multimodal embedding update segment --- .../rag/index_processor/processor/paragraph_index_processor.py | 2 +- .../index_processor/processor/parent_child_index_processor.py | 2 +- api/core/rag/index_processor/processor/qa_index_processor.py | 2 +- api/services/vector_service.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index 0c4ca2fec5..4977515c1d 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -92,7 +92,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor): if dataset.indexing_technique == "high_quality": vector = Vector(dataset) vector.create(documents) - if multimodel_documents: + if multimodel_documents and dataset.is_multimodal: vector.create_multimodel(multimodel_documents) with_keywords = False if with_keywords: diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 9b7d34ebf4..106a956d68 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ 
b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -128,7 +128,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor): Document.model_validate(child_document.model_dump()) for child_document in child_documents ] vector.create(formatted_child_documents) - if multimodel_documents: + if multimodel_documents and dataset.is_multimodal: vector.create_multimodel(multimodel_documents) def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index 25524be1b1..bb8f043407 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -139,7 +139,7 @@ class QAIndexProcessor(BaseIndexProcessor): if dataset.indexing_technique == "high_quality": vector = Vector(dataset) vector.create(documents) - if multimodel_documents: + if multimodel_documents and dataset.is_multimodal: vector.create_multimodel(multimodel_documents) def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): diff --git a/api/services/vector_service.py b/api/services/vector_service.py index f2056f38ed..25b1964ec3 100644 --- a/api/services/vector_service.py +++ b/api/services/vector_service.py @@ -95,7 +95,7 @@ class VectorService: if len(documents) > 0: index_processor.load( - dataset, documents, multimodel_documents, with_keywords=True, keywords_list=keywords_list + dataset, documents, multimodel_documents=None, with_keywords=True, keywords_list=keywords_list ) if len(multimodel_documents) > 0: index_processor.load(dataset, multimodel_documents, with_keywords=False) From c9fd47f1db4ddd5d4fc82b41d93ea077b91fe039 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Tue, 25 Nov 2025 19:43:50 +0800 Subject: [PATCH 5/7] multimodal embedding update segment --- api/core/indexing_runner.py | 26 +++++++++++-------- api/core/rag/datasource/vdb/vector_factory.py | 2 +- .../index_processor/index_processor_base.py | 16 ++++++------ .../processor/paragraph_index_processor.py | 8 +++--- .../processor/parent_child_index_processor.py | 8 +++--- .../processor/qa_index_processor.py | 8 +++--- api/services/dataset_service.py | 26 ++++++++++++++----- api/services/vector_service.py | 14 +++++----- api/tasks/add_document_to_index_task.py | 18 +++++++++++-- api/tasks/deal_dataset_index_update_task.py | 20 +++++++++++--- api/tasks/deal_dataset_vector_index_task.py | 20 +++++++++++--- api/tasks/delete_segment_from_index_task.py | 15 ++++++++--- api/tasks/disable_segments_from_index_task.py | 10 ++++++- api/tasks/enable_segment_to_index_task.py | 19 ++++++++++++-- api/tasks/enable_segments_to_index_task.py | 19 ++++++++++++-- 15 files changed, 167 insertions(+), 62 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index f51863c939..87f266d6a9 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -551,7 +551,8 @@ class IndexingRunner: indexing_start_at = time.perf_counter() tokens = 0 create_keyword_thread = None - if dataset_document.doc_form != IndexStructureType.PARENT_CHILD_INDEX and dataset.indexing_technique == "economy": + if (dataset_document.doc_form != IndexStructureType.PARENT_CHILD_INDEX + and dataset.indexing_technique == "economy"): # create keyword index create_keyword_thread = threading.Thread( target=self._process_keyword_index, @@ -635,11 +636,11 @@ class 
IndexingRunner: db.session.commit() def _process_chunk( - self, flask_app: Flask, - index_processor: BaseIndexProcessor, - chunk_documents: list[Document], - dataset: Dataset, - dataset_document: DatasetDocument, + self, flask_app: Flask, + index_processor: BaseIndexProcessor, + chunk_documents: list[Document], + dataset: Dataset, + dataset_document: DatasetDocument, embedding_model_instance: ModelInstance | None ): with flask_app.app_context(): @@ -651,13 +652,16 @@ class IndexingRunner: page_content_list = [document.page_content for document in chunk_documents] tokens += sum(embedding_model_instance.get_text_embedding_num_tokens(page_content_list)) - multimodel_documents = [] + multimodal_documents = [] for document in chunk_documents: - if document.attachments: - multimodel_documents.extend(document.attachments) + if document.attachments and dataset.is_multimodal: + multimodal_documents.extend(document.attachments) # load index - index_processor.load(dataset, chunk_documents, multimodel_documents=multimodel_documents, with_keywords=False) + index_processor.load(dataset, + chunk_documents, + multimodal_documents=multimodal_documents, + with_keywords=False) document_ids = [document.metadata["doc_id"] for document in chunk_documents] db.session.query(DocumentSegment).where( @@ -754,7 +758,7 @@ class IndexingRunner: ) # add document segments - doc_store.add_documents(docs=documents, + doc_store.add_documents(docs=documents, save_child=dataset_document.doc_form == IndexStructureType.PARENT_CHILD_INDEX) # update document status to indexing diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 13b5d8aad1..058a29f0b7 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -208,7 +208,7 @@ class Vector: self._vector_processor.create(texts=batch, embeddings=batch_embeddings, **kwargs) logger.info("Embedding %s texts took %s s", len(texts), time.time() - start) - def create_multimodel(self, file_documents: list | None = None, **kwargs): + def create_multimodal(self, file_documents: list | None = None, **kwargs): if file_documents: start = time.time() logger.info("start embedding %s files %s", len(file_documents), start) diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index 48895b418d..477d9d2faf 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -40,7 +40,7 @@ class BaseIndexProcessor(ABC): self, dataset: Dataset, documents: list[Document], - multimodel_documents: list[Document] | None = None, + multimodal_documents: list[AttachmentDocument] | None = None, with_keywords: bool = True, **kwargs, ): @@ -114,10 +114,10 @@ class BaseIndexProcessor(ABC): """ multi_model_documents = [] text = document.page_content - + # Collect all upload_file_ids including duplicates to preserve occurrence count upload_file_id_list = [] - + # For data before v0.10.0 pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?" 
matches = re.finditer(pattern, text) @@ -139,22 +139,22 @@ class BaseIndexProcessor(ABC): for match in matches: upload_file_id = match.group(1) upload_file_id_list.append(upload_file_id) - + if not upload_file_id_list: return multi_model_documents - + # Get unique IDs for database query unique_upload_file_ids = list(set(upload_file_id_list)) upload_files = db.session.query(UploadFile).filter(UploadFile.id.in_(unique_upload_file_ids)).all() - + # Create a mapping from ID to UploadFile for quick lookup upload_file_map = {upload_file.id: upload_file for upload_file in upload_files} - + # Create a Document for each occurrence (including duplicates) for upload_file_id in upload_file_id_list: upload_file = upload_file_map.get(upload_file_id) if upload_file: - multi_model_documents.append(Document( + multi_model_documents.append(AttachmentDocument( page_content=upload_file.name, metadata={ "doc_id": upload_file.id, diff --git a/api/core/rag/index_processor/processor/paragraph_index_processor.py b/api/core/rag/index_processor/processor/paragraph_index_processor.py index 4977515c1d..defe9dd72f 100644 --- a/api/core/rag/index_processor/processor/paragraph_index_processor.py +++ b/api/core/rag/index_processor/processor/paragraph_index_processor.py @@ -85,15 +85,15 @@ class ParagraphIndexProcessor(BaseIndexProcessor): self, dataset: Dataset, documents: list[Document], - multimodel_documents: list[Document] | None = None, + multimodal_documents: list[AttachmentDocument] | None = None, with_keywords: bool = True, **kwargs, ): if dataset.indexing_technique == "high_quality": vector = Vector(dataset) vector.create(documents) - if multimodel_documents and dataset.is_multimodal: - vector.create_multimodel(multimodel_documents) + if multimodal_documents and dataset.is_multimodal: + vector.create_multimodal(multimodal_documents) with_keywords = False if with_keywords: keywords_list = kwargs.get("keywords_list") @@ -201,7 +201,7 @@ class ParagraphIndexProcessor(BaseIndexProcessor): vector = Vector(dataset) vector.create(documents) if all_multimodal_documents: - vector.create_multimodel(all_multimodal_documents) + vector.create_multimodal(all_multimodal_documents) elif dataset.indexing_technique == "economy": keyword = Keyword(dataset) keyword.add_texts(documents) diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py index 106a956d68..d52be7a0e0 100644 --- a/api/core/rag/index_processor/processor/parent_child_index_processor.py +++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py @@ -115,7 +115,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor): self, dataset: Dataset, documents: list[Document], - multimodel_documents: list[Document] | None = None, + multimodal_documents: list[AttachmentDocument] | None = None, with_keywords: bool = True, **kwargs, ): @@ -128,8 +128,8 @@ class ParentChildIndexProcessor(BaseIndexProcessor): Document.model_validate(child_document.model_dump()) for child_document in child_documents ] vector.create(formatted_child_documents) - if multimodel_documents and dataset.is_multimodal: - vector.create_multimodel(multimodel_documents) + if multimodal_documents and dataset.is_multimodal: + vector.create_multimodal(multimodal_documents) def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): # node_ids is segment's node_ids @@ -308,7 +308,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor): if all_child_documents: 
vector.create(all_child_documents) if all_multimodal_documents: - vector.create_multimodel(all_multimodal_documents) + vector.create_multimodal(all_multimodal_documents) def format_preview(self, chunks: Any) -> Mapping[str, Any]: parent_childs = ParentChildStructureChunk.model_validate(chunks) diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py index bb8f043407..1ba67c602a 100644 --- a/api/core/rag/index_processor/processor/qa_index_processor.py +++ b/api/core/rag/index_processor/processor/qa_index_processor.py @@ -20,7 +20,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.extract_processor import ExtractProcessor from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.index_processor.index_processor_base import BaseIndexProcessor -from core.rag.models.document import Document, QAStructureChunk +from core.rag.models.document import Document, QAStructureChunk, AttachmentDocument from core.rag.retrieval.retrieval_methods import RetrievalMethod from core.tools.utils.text_processing_utils import remove_leading_symbols from libs import helper @@ -132,15 +132,15 @@ class QAIndexProcessor(BaseIndexProcessor): self, dataset: Dataset, documents: list[Document], - multimodel_documents: list[Document] | None = None, + multimodal_documents: list[AttachmentDocument] | None = None, with_keywords: bool = True, **kwargs, ): if dataset.indexing_technique == "high_quality": vector = Vector(dataset) vector.create(documents) - if multimodel_documents and dataset.is_multimodal: - vector.create_multimodel(multimodel_documents) + if multimodal_documents and dataset.is_multimodal: + vector.create_multimodal(multimodal_documents) def clean(self, dataset: Dataset, node_ids: list[str] | None, with_keywords: bool = True, **kwargs): vector = Vector(dataset) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index c3e02d9ecd..affb33c74a 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -424,13 +424,13 @@ class DatasetService: if not dataset: raise ValueError("Dataset not found") # check if dataset name is exists - - if DatasetService._has_dataset_same_name( - tenant_id=dataset.tenant_id, - dataset_id=dataset_id, - name=data.get("name", dataset.name), - ): - raise ValueError("Dataset name already exists") + if data.get("name") and data.get("name") != dataset.name: + if DatasetService._has_dataset_same_name( + tenant_id=dataset.tenant_id, + dataset_id=dataset_id, + name=data.get("name", dataset.name), + ): + raise ValueError("Dataset name already exists") # Verify user has permission to update this dataset DatasetService.check_dataset_permission(dataset, user) @@ -866,6 +866,10 @@ class DatasetService: model_type=ModelType.TEXT_EMBEDDING, model=knowledge_configuration.embedding_model or "", ) + is_multimodal = DatasetService.check_is_multimodal_model( + current_user.current_tenant_id, knowledge_configuration.embedding_model_provider, knowledge_configuration.embedding_model + ) + dataset.is_multimodal = is_multimodal dataset.embedding_model = embedding_model.model dataset.embedding_model_provider = embedding_model.provider dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding( @@ -902,6 +906,10 @@ class DatasetService: dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding( embedding_model.provider, embedding_model.model 
) + is_multimodal = DatasetService.check_is_multimodal_model( + current_user.current_tenant_id, knowledge_configuration.embedding_model_provider, knowledge_configuration.embedding_model + ) + dataset.is_multimodal = is_multimodal dataset.collection_binding_id = dataset_collection_binding.id dataset.indexing_technique = knowledge_configuration.indexing_technique except LLMBadRequestError: @@ -959,6 +967,10 @@ class DatasetService: ) ) dataset.collection_binding_id = dataset_collection_binding.id + is_multimodal = DatasetService.check_is_multimodal_model( + current_user.current_tenant_id, knowledge_configuration.embedding_model_provider, knowledge_configuration.embedding_model + ) + dataset.is_multimodal = is_multimodal except LLMBadRequestError: raise ValueError( "No Embedding Model available. Please configure a valid provider " diff --git a/api/services/vector_service.py b/api/services/vector_service.py index 25b1964ec3..5379eaad7b 100644 --- a/api/services/vector_service.py +++ b/api/services/vector_service.py @@ -8,7 +8,7 @@ from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.index_processor.index_processor_base import BaseIndexProcessor from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.rag.models.document import Document +from core.rag.models.document import AttachmentDocument, Document from extensions.ext_database import db from models import UploadFile from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment, SegmentAttachmentBinding @@ -24,7 +24,7 @@ class VectorService: cls, keywords_list: list[list[str]] | None, segments: list[DocumentSegment], dataset: Dataset, doc_form: str ): documents: list[Document] = [] - multimodel_documents: list[Document] = [] + multimodal_documents: list[AttachmentDocument] = [] for segment in segments: if doc_form == IndexStructureType.PARENT_CHILD_INDEX: @@ -80,7 +80,7 @@ class VectorService: documents.append(rag_document) if dataset.is_multimodal: for attachment in segment.attachments: - multimodel_document: Document = Document( + multimodal_document: AttachmentDocument = AttachmentDocument( page_content=attachment["name"], metadata={ "doc_id": attachment["id"], @@ -90,15 +90,15 @@ class VectorService: "doc_type": DocType.IMAGE, }, ) - multimodel_documents.append(multimodel_document) + multimodal_documents.append(multimodal_document) index_processor: BaseIndexProcessor = IndexProcessorFactory(doc_form).init_index_processor() if len(documents) > 0: index_processor.load( - dataset, documents, multimodel_documents=None, with_keywords=True, keywords_list=keywords_list + dataset, documents, None, with_keywords=True, keywords_list=keywords_list ) - if len(multimodel_documents) > 0: - index_processor.load(dataset, multimodel_documents, with_keywords=False) + if len(multimodal_documents) > 0: + index_processor.load(dataset, [], multimodal_documents, with_keywords=False) @classmethod def update_segment_vector(cls, keywords: list[str] | None, segment: DocumentSegment, dataset: Dataset): diff --git a/api/tasks/add_document_to_index_task.py b/api/tasks/add_document_to_index_task.py index a3a291f439..cbef9d3cdb 100644 --- a/api/tasks/add_document_to_index_task.py +++ b/api/tasks/add_document_to_index_task.py @@ -4,9 +4,10 @@ import time import click from celery import shared_task +from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type 
import IndexStructureType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.rag.models.document import ChildDocument, Document +from core.rag.models.document import AttachmentDocument, ChildDocument, Document from extensions.ext_database import db from extensions.ext_redis import redis_client from libs.datetime_utils import naive_utc_now @@ -55,6 +56,7 @@ def add_document_to_index_task(dataset_document_id: str): ) documents = [] + multimodal_documents = [] for segment in segments: document = Document( page_content=segment.content, @@ -81,11 +83,23 @@ def add_document_to_index_task(dataset_document_id: str): ) child_documents.append(child_document) document.children = child_documents + if dataset.is_multimodal: + for attachment in segment.attachments: + multimodal_documents.append(AttachmentDocument( + page_content=attachment["name"], + metadata={ + "doc_id": attachment["id"], + "doc_hash": "", + "document_id": segment.document_id, + "dataset_id": segment.dataset_id, + "doc_type": DocType.IMAGE, + }, + )) documents.append(document) index_type = dataset.doc_form index_processor = IndexProcessorFactory(index_type).init_index_processor() - index_processor.load(dataset, documents) + index_processor.load(dataset, documents, multimodal_documents=multimodal_documents) # delete auto disable log db.session.query(DatasetAutoDisableLog).where(DatasetAutoDisableLog.document_id == dataset_document.id).delete() diff --git a/api/tasks/deal_dataset_index_update_task.py b/api/tasks/deal_dataset_index_update_task.py index aba35bd29e..903ebc0b31 100644 --- a/api/tasks/deal_dataset_index_update_task.py +++ b/api/tasks/deal_dataset_index_update_task.py @@ -4,9 +4,10 @@ import time import click from celery import shared_task # type: ignore +from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.rag.models.document import ChildDocument, Document +from core.rag.models.document import AttachmentDocument, ChildDocument, Document from extensions.ext_database import db from models.dataset import Dataset, DocumentSegment from models.dataset import Document as DatasetDocument @@ -28,7 +29,7 @@ def deal_dataset_index_update_task(dataset_id: str, action: str): if not dataset: raise Exception("Dataset not found") - index_type = dataset.doc_form or IndexType.PARAGRAPH_INDEX + index_type = dataset.doc_form or IndexStructureType.PARAGRAPH_INDEX index_processor = IndexProcessorFactory(index_type).init_index_processor() if action == "upgrade": dataset_documents = ( @@ -119,6 +120,7 @@ def deal_dataset_index_update_task(dataset_id: str, action: str): ) if segments: documents = [] + multimodal_documents = [] for segment in segments: document = Document( page_content=segment.content, @@ -145,9 +147,21 @@ def deal_dataset_index_update_task(dataset_id: str, action: str): ) child_documents.append(child_document) document.children = child_documents + if dataset.is_multimodal: + for attachment in segment.attachments: + multimodal_documents.append(AttachmentDocument( + page_content=attachment["name"], + metadata={ + "doc_id": attachment["id"], + "doc_hash": "", + "document_id": segment.document_id, + "dataset_id": segment.dataset_id, + "doc_type": DocType.IMAGE, + }, + )) documents.append(document) # save vector index - index_processor.load(dataset, documents, with_keywords=False) + index_processor.load(dataset, 
documents, multimodal_documents=multimodal_documents, with_keywords=False) db.session.query(DatasetDocument).where(DatasetDocument.id == dataset_document.id).update( {"indexing_status": "completed"}, synchronize_session=False ) diff --git a/api/tasks/deal_dataset_vector_index_task.py b/api/tasks/deal_dataset_vector_index_task.py index 4019755b3f..9d647af56a 100644 --- a/api/tasks/deal_dataset_vector_index_task.py +++ b/api/tasks/deal_dataset_vector_index_task.py @@ -6,9 +6,10 @@ import click from celery import shared_task from sqlalchemy import select +from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.rag.models.document import ChildDocument, Document +from core.rag.models.document import AttachmentDocument, ChildDocument, Document from extensions.ext_database import db from models.dataset import Dataset, DocumentSegment from models.dataset import Document as DatasetDocument @@ -32,7 +33,7 @@ def deal_dataset_vector_index_task(dataset_id: str, action: Literal["remove", "a if not dataset: raise Exception("Dataset not found") - index_type = dataset.doc_form or IndexType.PARAGRAPH_INDEX + index_type = dataset.doc_form or IndexStructureType.PARAGRAPH_INDEX index_processor = IndexProcessorFactory(index_type).init_index_processor() if action == "remove": index_processor.clean(dataset, None, with_keywords=False) @@ -119,6 +120,7 @@ def deal_dataset_vector_index_task(dataset_id: str, action: Literal["remove", "a ) if segments: documents = [] + multimodal_documents = [] for segment in segments: document = Document( page_content=segment.content, @@ -145,9 +147,21 @@ def deal_dataset_vector_index_task(dataset_id: str, action: Literal["remove", "a ) child_documents.append(child_document) document.children = child_documents + if dataset.is_multimodal: + for attachment in segment.attachments: + multimodal_documents.append(AttachmentDocument( + page_content=attachment["name"], + metadata={ + "doc_id": attachment["id"], + "doc_hash": "", + "document_id": segment.document_id, + "dataset_id": segment.dataset_id, + "doc_type": DocType.IMAGE, + }, + )) documents.append(document) # save vector index - index_processor.load(dataset, documents, with_keywords=False) + index_processor.load(dataset, documents, multimodal_documents=multimodal_documents, with_keywords=False) db.session.query(DatasetDocument).where(DatasetDocument.id == dataset_document.id).update( {"indexing_status": "completed"}, synchronize_session=False ) diff --git a/api/tasks/delete_segment_from_index_task.py b/api/tasks/delete_segment_from_index_task.py index 22a730dcd2..594e066860 100644 --- a/api/tasks/delete_segment_from_index_task.py +++ b/api/tasks/delete_segment_from_index_task.py @@ -7,6 +7,7 @@ from celery import shared_task from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from extensions.ext_database import db from models.dataset import Dataset, Document, SegmentAttachmentBinding +from models.model import UploadFile logger = logging.getLogger(__name__) @@ -51,10 +52,18 @@ def delete_segment_from_index_task( ) if dataset.is_multimodal: # delete segment attachment binding - db.session.query(SegmentAttachmentBinding).filter( + segment_attachment_bindings = db.session.query(SegmentAttachmentBinding).filter( SegmentAttachmentBinding.segment_id.in_(segment_ids) - ).delete() - db.session.commit() + ).all() + if 
segment_attachment_bindings: + attachment_ids = [binding.attachment_id for binding in segment_attachment_bindings] + index_processor.clean( + dataset=dataset, node_ids=attachment_ids, with_keywords=False) + for binding in segment_attachment_bindings: + db.session.delete(binding) + # delete upload file + db.session.query(UploadFile).filter(UploadFile.id.in_(attachment_ids)).delete(synchronize_session=False) + db.session.commit() end_at = time.perf_counter() logger.info(click.style(f"Segment deleted from index latency: {end_at - start_at}", fg="green")) diff --git a/api/tasks/disable_segments_from_index_task.py b/api/tasks/disable_segments_from_index_task.py index 9038dc179b..ff013cc40d 100644 --- a/api/tasks/disable_segments_from_index_task.py +++ b/api/tasks/disable_segments_from_index_task.py @@ -8,7 +8,7 @@ from sqlalchemy import select from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from extensions.ext_database import db from extensions.ext_redis import redis_client -from models.dataset import Dataset, DocumentSegment +from models.dataset import Dataset, DocumentSegment, SegmentAttachmentBinding from models.dataset import Document as DatasetDocument logger = logging.getLogger(__name__) @@ -59,6 +59,14 @@ def disable_segments_from_index_task(segment_ids: list, dataset_id: str, documen try: index_node_ids = [segment.index_node_id for segment in segments] + if dataset.is_multimodal: + segment_ids = [segment.id for segment in segments] + segment_attachment_bindings = db.session.query(SegmentAttachmentBinding).filter( + SegmentAttachmentBinding.segment_id.in_(segment_ids) + ).all() + if segment_attachment_bindings: + attachment_ids = [binding.attachment_id for binding in segment_attachment_bindings] + index_node_ids.extend(attachment_ids) index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=False) end_at = time.perf_counter() diff --git a/api/tasks/enable_segment_to_index_task.py b/api/tasks/enable_segment_to_index_task.py index 81c24cba81..e3fec2f80d 100644 --- a/api/tasks/enable_segment_to_index_task.py +++ b/api/tasks/enable_segment_to_index_task.py @@ -4,9 +4,10 @@ import time import click from celery import shared_task +from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.rag.models.document import ChildDocument, Document +from core.rag.models.document import AttachmentDocument, ChildDocument, Document from extensions.ext_database import db from extensions.ext_redis import redis_client from libs.datetime_utils import naive_utc_now @@ -83,8 +84,22 @@ def enable_segment_to_index_task(segment_id: str): ) child_documents.append(child_document) document.children = child_documents + multimodel_documents = [] + if dataset.is_multimodal: + for attachment in segment.attachments: + multimodel_documents.append(AttachmentDocument( + page_content=attachment["name"], + metadata={ + "doc_id": attachment["id"], + "doc_hash": "", + "document_id": segment.document_id, + "dataset_id": segment.dataset_id, + "doc_type": DocType.IMAGE, + }, + )) + # save vector index - index_processor.load(dataset, [document]) + index_processor.load(dataset, [document], multimodal_documents=multimodel_documents) end_at = time.perf_counter() logger.info(click.style(f"Segment enabled to index: {segment.id} latency: {end_at - start_at}", fg="green")) diff --git 
a/api/tasks/enable_segments_to_index_task.py b/api/tasks/enable_segments_to_index_task.py index 6c5853422e..eb8aafe049 100644 --- a/api/tasks/enable_segments_to_index_task.py +++ b/api/tasks/enable_segments_to_index_task.py @@ -5,9 +5,10 @@ import click from celery import shared_task from sqlalchemy import select +from core.rag.index_processor.constant.doc_type import DocType from core.rag.index_processor.constant.index_type import IndexStructureType from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.rag.models.document import ChildDocument, Document +from core.rag.models.document import AttachmentDocument, ChildDocument, Document from extensions.ext_database import db from extensions.ext_redis import redis_client from libs.datetime_utils import naive_utc_now @@ -60,6 +61,7 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i try: documents = [] + multimodal_documents = [] for segment in segments: document = Document( page_content=segment.content, @@ -87,9 +89,22 @@ def enable_segments_to_index_task(segment_ids: list, dataset_id: str, document_i ) child_documents.append(child_document) document.children = child_documents + + if dataset.is_multimodal: + for attachment in segment.attachments: + multimodal_documents.append(AttachmentDocument( + page_content=attachment["name"], + metadata={ + "doc_id": attachment["id"], + "doc_hash": "", + "document_id": segment.document_id, + "dataset_id": segment.dataset_id, + "doc_type": DocType.IMAGE, + }, + )) documents.append(document) # save vector index - index_processor.load(dataset, documents) + index_processor.load(dataset, documents, multimodal_documents=multimodal_documents) end_at = time.perf_counter() logger.info(click.style(f"Segments enabled to index latency: {end_at - start_at}", fg="green")) From 464cf09f2bde6840401051a2383cc4758a429e68 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Wed, 26 Nov 2025 11:36:00 +0800 Subject: [PATCH 6/7] multimodal embedding update segment --- api/core/rag/embedding/cached_embedding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/api/core/rag/embedding/cached_embedding.py b/api/core/rag/embedding/cached_embedding.py index b5d3fb7cb4..3cbc7db75d 100644 --- a/api/core/rag/embedding/cached_embedding.py +++ b/api/core/rag/embedding/cached_embedding.py @@ -171,6 +171,7 @@ class CacheEmbedding(Embeddings): model_name=self._model_instance.model, hash=file_id, provider_name=self._model_instance.provider, + embedding=pickle.dumps(n_embedding, protocol=pickle.HIGHEST_PROTOCOL), ) embedding_cache.set_embedding(n_embedding) db.session.add(embedding_cache) From 687a07bba70803391879ed27ba17cefa58690d66 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Wed, 26 Nov 2025 11:49:37 +0800 Subject: [PATCH 7/7] multimodal embedding update segment --- api/tasks/deal_dataset_vector_index_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/tasks/deal_dataset_vector_index_task.py b/api/tasks/deal_dataset_vector_index_task.py index 9d647af56a..b3d2fbefbb 100644 --- a/api/tasks/deal_dataset_vector_index_task.py +++ b/api/tasks/deal_dataset_vector_index_task.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) @shared_task(queue="dataset") -def deal_dataset_vector_index_task(dataset_id: str, action: Literal["remove", "add", "update"]): +def deal_dataset_vector_index_task(dataset_id: str, action: str): """ Async deal dataset from index :param dataset_id: dataset_id
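

For context, a minimal sketch of the attachment-indexing flow these commits introduce, assuming the internals shown in the diffs above (AttachmentDocument, DocType, IndexProcessorFactory, Dataset.is_multimodal) and the load() signature as it appears there; this is illustrative only, not code from the patches.

    # Illustrative sketch: how one segment's image attachments reach the
    # multimodal vector index, per the flow added in this patch series.
    # Assumes the Dify internals shown in the diffs above keep these names.
    from core.rag.index_processor.constant.doc_type import DocType
    from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
    from core.rag.models.document import AttachmentDocument, Document


    def index_segment_with_attachments(dataset, segment):
        """Build text and attachment documents for one segment and load both."""
        document = Document(
            page_content=segment.content,
            metadata={
                "doc_id": segment.index_node_id,
                "doc_hash": segment.index_node_hash,
                "document_id": segment.document_id,
                "dataset_id": segment.dataset_id,
            },
        )

        multimodal_documents = []
        if dataset.is_multimodal:
            # Each attachment becomes its own AttachmentDocument, embedded
            # separately from the text chunk but sharing the segment's
            # document/dataset metadata.
            for attachment in segment.attachments:
                multimodal_documents.append(
                    AttachmentDocument(
                        page_content=attachment["name"],
                        metadata={
                            "doc_id": attachment["id"],
                            "doc_hash": "",
                            "document_id": segment.document_id,
                            "dataset_id": segment.dataset_id,
                            "doc_type": DocType.IMAGE,
                        },
                    )
                )

        index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
        # load() sends text documents to Vector.create() and, when the dataset
        # is multimodal, attachment documents to Vector.create_multimodal().
        index_processor.load(
            dataset,
            [document],
            multimodal_documents=multimodal_documents,
            with_keywords=False,
        )

Keeping the attachment documents separate from the text chunks lets Vector.create() and Vector.create_multimodal() take different embedding paths while the attachments still carry the same document_id/dataset_id metadata as their parent segment.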