Compare commits

...

3 Commits

Author SHA1 Message Date
Frederick2313072
626e71cb3b feat: implement content-based deduplication for document segments
- Add database index on (dataset_id, index_node_hash) for efficient deduplication queries
- Add deduplication check in SegmentService.create_segment and multi_create_segment methods
- Add deduplication check in DatasetDocumentStore.add_documents method to prevent duplicate embedding processing
- Skip creating segments with identical content hashes across the entire dataset

This prevents duplicate content from being re-processed and re-embedded when uploading documents with repeated content, improving efficiency and reducing unnecessary compute costs.
2025-09-20 06:28:14 +08:00
Frederick2313072
07047487c3 fix 2025-09-20 05:41:25 +08:00
Frederick2313072
c7064d44af feat: local dev 2025-09-20 05:30:39 +08:00
6 changed files with 320 additions and 0 deletions

1
.gitignore vendored
View File

@@ -147,6 +147,7 @@ api/.idea
api/.env
api/storage/*
api/Dockerfile.local
docker-legacy/volumes/app/storage/*
docker-legacy/volumes/db/data/*

View File

@@ -93,6 +93,17 @@ class DatasetDocumentStore:
segment_document = self.get_document_segment(doc_id=doc.metadata["doc_id"])
# Check if a segment with the same content hash already exists in the dataset
existing_segment_by_hash = db.session.query(DocumentSegment).filter_by(
dataset_id=self._dataset.id,
index_node_hash=doc.metadata["doc_hash"],
enabled=True
).first()
if existing_segment_by_hash:
# Skip creating duplicate segment with same content hash
continue
# NOTE: doc could already exist in the store, but we overwrite it
if not allow_update and segment_document:
raise ValueError(

View File

@@ -689,6 +689,7 @@ class DocumentSegment(Base):
sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
sa.Index("document_segment_tenant_idx", "tenant_id"),
sa.Index("document_segment_dataset_hash_idx", "dataset_id", "index_node_hash"),
)
# initial fields

View File

@@ -2623,6 +2623,17 @@ class SegmentService:
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
lock_name = f"add_segment_lock_document_id_{document.id}"
with redis_client.lock(lock_name, timeout=600):
# Check if a segment with the same content hash already exists
existing_segment = db.session.query(DocumentSegment).filter_by(
dataset_id=document.dataset_id,
index_node_hash=segment_hash,
enabled=True
).first()
if existing_segment:
logger.info(f"Segment with same content hash already exists: {segment_hash}")
return existing_segment
max_position = (
db.session.query(func.max(DocumentSegment.position))
.where(DocumentSegment.document_id == document.id)
@@ -2689,6 +2700,15 @@ class SegmentService:
.where(DocumentSegment.document_id == document.id)
.scalar()
)
# Batch query existing hashes before the loop
segment_hashes = [helper.generate_text_hash(seg["content"]) for seg in segments]
existing_segments = db.session.query(DocumentSegment.index_node_hash).filter(
DocumentSegment.dataset_id == document.dataset_id,
DocumentSegment.index_node_hash.in_(segment_hashes),
DocumentSegment.enabled == True
).all()
existing_hashes = {seg.index_node_hash for seg in existing_segments}
pre_segment_data_list = []
segment_data_list = []
keywords_list = []
@@ -2697,6 +2717,12 @@ class SegmentService:
content = segment_item["content"]
doc_id = str(uuid.uuid4())
segment_hash = helper.generate_text_hash(content)
# Skip existing segments
if segment_hash in existing_hashes:
logger.info(f"Skipping duplicate segment with hash: {segment_hash}")
continue
tokens = 0
if dataset.indexing_technique == "high_quality" and embedding_model:
# calc embedding use tokens

218
docker/README-local-test.md Normal file
View File

@@ -0,0 +1,218 @@
# 本地测试环境设置指南
本文档说明如何创建和使用本地的 Docker Compose 测试环境,该环境不会被提交到版本控制。
## 📁 文件结构
```
docker/
├── .env # 本地环境配置
├── docker-compose.override.yaml # 本地覆盖配置
├── start-local-test.bat # Windows启动脚本
└── README-local-test.md # 本文档
```
## 🚀 快速开始
### 1. 准备环境配置文件
**使用 `.env`**
```bash
cd docker
copy .env.example .env
```
**注意**: 请确保 Docker Desktop 正在运行,然后执行启动脚本。
### 2. 修改配置(可选)
编辑你选择的环境文件,调整适合本地测试的配置:
```bash
# 开发环境
DEPLOY_ENV=DEVELOPMENT
# 启用调试
DEBUG=true
FLASK_DEBUG=true
LOG_LEVEL=DEBUG
# 数据库配置(保持默认即可)
DB_USERNAME=postgres
DB_PASSWORD=difyai123456
# 向量存储本地测试推荐Weaviate
VECTOR_STORE=weaviate
```
### 3. 启动测试环境
**Windows用户**
```cmd
cd docker
start-local-test.bat
```
**脚本会自动**
- 检查 Docker Desktop 是否运行
- 验证 `.env` 配置文件存在
- 构建 worker 镜像(使用本地 Dockerfile)
- 启动所有服务
或者手动启动:
```bash
# 启动中间件(数据库、Redis、向量存储)
docker compose -f docker-compose.middleware.yaml --profile weaviate up -d
# 启动应用服务
docker compose up -d
```
## 🎯 服务说明
### 中间件服务(docker-compose.middleware.yaml)
- **PostgreSQL**: 主数据库
- **Redis**: 缓存和消息队列
- **Weaviate**: 向量数据库(默认)
- **其他**: 可根据需要启用不同的向量存储
### 应用服务(docker-compose.yaml + override)
- **API**: 后端服务(开发模式,支持热重载)
- **Web**: 前端服务(开发模式)
- **Nginx**: 反向代理
- **Worker**: 后台任务处理
## 📝 本地开发特性
### 热重载
- API服务会自动检测代码变化并重启
- Web服务支持前端热重载
### 数据持久化
数据存储在 `docker/volumes/` 目录下,会在容器重启后保留。
### 调试支持
- 启用Flask调试模式
- 详细的日志输出
- API文档自动生成
## 🛠️ 常用命令
```bash
# 查看服务状态
docker compose ps
# 查看日志
docker compose logs -f [service_name]
# 重启特定服务
docker compose restart api
# 进入容器调试
docker compose exec api bash
# 停止所有服务
docker compose down
# 停止并清理数据卷
docker compose -f docker-compose.middleware.yaml down -v
```
## 🔧 自定义配置
### 修改端口
在环境文件中修改:
```bash
DIFY_PORT=5002 # API端口
EXPOSE_NGINX_PORT=8080 # Web端口
```
### 切换向量存储
在环境文件中修改:
```bash
VECTOR_STORE=qdrant # 或 milvus, chroma 等
```
然后重新启动中间件:
```bash
docker compose -f docker-compose.middleware.yaml --profile qdrant up -d
```
### 使用本地 Dockerfile
如果需要使用自定义的 Dockerfile(比如使用国内镜像加速):
1. **创建本地 Dockerfile**
```bash
# 复制原文件
cp api/Dockerfile api/Dockerfile.local
# 编辑本地文件(比如取消阿里云镜像注释)
# 编辑本地文件,第 15 行取消注释:RUN sed -i 's@deb.debian.org@mirrors.aliyun.com@g' /etc/apt/sources.list.d/debian.sources
```
2. **配置 override 使用本地 Dockerfile**
`docker-compose.override.yaml` 已经配置好了使用 `Dockerfile.local`
3. **构建时会自动使用**
```bash
docker compose --env-file .env build worker
```
### 添加自定义服务
编辑 `docker-compose.override.yaml` 添加新服务。
## 📚 最佳实践
1. **不要修改官方文件**: 不要直接修改 `docker-compose.yaml`,所有本地改动都放在 `docker-compose.override.yaml` 中。
2. **使用有意义的环境文件**: 使用 `.env` 文件进行本地配置。
3. **定期清理**: 测试完成后清理不需要的数据卷。
4. **版本控制**: 这些本地文件(`.env`, `docker-compose.override.yaml`, `Dockerfile.local`)会被 `.gitignore` 忽略,不会提交到仓库。
## 🐛 故障排除
### 服务启动失败
```bash
# 检查端口占用
netstat -tulpn | grep :5001
# 检查Docker资源
docker system df
# 查看详细日志
docker compose logs
```
### 数据库连接问题
```bash
# 检查数据库状态
docker compose exec db pg_isready
# 重置数据库
docker compose down
docker volume rm dify_db_data
docker compose up -d db
```
### 内存不足
减少服务资源使用:
```yaml
# 在 docker-compose.override.yaml 中添加
services:
db:
environment:
POSTGRES_SHARED_BUFFERS: 64MB
redis:
command: redis-server --maxmemory 64mb
```
## 📞 获取帮助
如果遇到问题,请:
1. 检查本文档
2. 查看 [官方文档](https://docs.dify.ai)
3. 在GitHub Issues中搜索类似问题

View File

@@ -0,0 +1,63 @@
@echo off
REM Switch console code page to UTF-8 so any non-ASCII output renders correctly.
chcp 65001 >nul
REM Dify Local Test Environment Startup Script (Windows)
REM Used to quickly start local development and testing environment
REM Flow: verify Docker daemon and .env config, build the worker image,
REM then bring up all services with docker compose.
echo [INFO] Starting Dify local test environment...
REM Ensure in docker directory (%~dp0 expands to this script's directory; /d also switches drives)
cd /d "%~dp0"
REM Check if Docker is running ("docker info" fails fast when the daemon is unreachable; output discarded)
docker info >nul 2>&1
if errorlevel 1 (
echo [ERROR] Docker is not running. Please start Docker Desktop first.
pause
exit /b 1
)
REM Check if .env file exists (required by the --env-file flags below)
if not exist ".env" (
echo [ERROR] .env configuration file not found
echo Please create first: copy .env.example .env
pause
exit /b 1
)
echo [INFO] Using config file: .env
REM Build worker image
REM NOTE(review): presumably the compose config points the worker at Dockerfile.local
REM via docker-compose.override.yaml — confirm against that file.
echo [INFO] Building worker image...
docker compose --env-file .env build worker
if errorlevel 1 (
echo [ERROR] Failed to build worker image
pause
exit /b 1
)
REM Start all services in detached mode
echo [INFO] Starting all services...
docker compose --env-file .env up -d
if errorlevel 1 (
echo [ERROR] Failed to start services
pause
exit /b 1
)
REM Success summary: print service URLs and common maintenance commands.
echo [SUCCESS] Local test environment started successfully!
echo.
echo [SERVICES] Service URLs:
echo - Web UI: http://localhost
echo - API Docs: http://localhost/swagger-ui.html
echo - API Service: http://localhost:5001
echo.
echo [COMMANDS] Available commands:
echo - View logs: docker compose logs -f
echo - Stop services: docker compose down
echo - Clean data: docker compose -f docker-compose.middleware.yaml down -v
echo - Restart services: docker compose restart
echo.
echo [TIP] If first run, wait a few minutes for services to fully start
echo Use 'docker compose ps' to check service status
REM Keep the console window open so the user can read the output.
pause