Compare commits

...

3 Commits

Author SHA1 Message Date
Frederick2313072
626e71cb3b feat: implement content-based deduplication for document segments
- Add database index on (dataset_id, index_node_hash) for efficient deduplication queries
- Add deduplication check in SegmentService.create_segment and multi_create_segment methods
- Add deduplication check in DatasetDocumentStore.add_documents method to prevent duplicate embedding processing
- Skip creating segments with identical content hashes across the entire dataset

This prevents duplicate content from being re-processed and re-embedded when uploading documents with repeated content, improving efficiency and reducing unnecessary compute costs.
2025-09-20 06:28:14 +08:00
Frederick2313072
07047487c3 fix 2025-09-20 05:41:25 +08:00
Frederick2313072
c7064d44af feat: local dev 2025-09-20 05:30:39 +08:00
6 changed files with 320 additions and 0 deletions

1
.gitignore vendored
View File

@@ -147,6 +147,7 @@ api/.idea
api/.env
api/storage/*
api/Dockerfile.local
docker-legacy/volumes/app/storage/*
docker-legacy/volumes/db/data/*

View File

@@ -93,6 +93,17 @@ class DatasetDocumentStore:
segment_document = self.get_document_segment(doc_id=doc.metadata["doc_id"])
# Check if a segment with the same content hash already exists in the dataset
existing_segment_by_hash = db.session.query(DocumentSegment).filter_by(
dataset_id=self._dataset.id,
index_node_hash=doc.metadata["doc_hash"],
enabled=True
).first()
if existing_segment_by_hash:
# Skip creating duplicate segment with same content hash
continue
# NOTE: doc could already exist in the store, but we overwrite it
if not allow_update and segment_document:
raise ValueError(

View File

@@ -689,6 +689,7 @@ class DocumentSegment(Base):
sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
sa.Index("document_segment_tenant_idx", "tenant_id"),
sa.Index("document_segment_dataset_hash_idx", "dataset_id", "index_node_hash"),
)
# initial fields

View File

@@ -2623,6 +2623,17 @@ class SegmentService:
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
lock_name = f"add_segment_lock_document_id_{document.id}"
with redis_client.lock(lock_name, timeout=600):
# Check if a segment with the same content hash already exists
existing_segment = db.session.query(DocumentSegment).filter_by(
dataset_id=document.dataset_id,
index_node_hash=segment_hash,
enabled=True
).first()
if existing_segment:
logger.info(f"Segment with same content hash already exists: {segment_hash}")
return existing_segment
max_position = (
db.session.query(func.max(DocumentSegment.position))
.where(DocumentSegment.document_id == document.id)
@@ -2689,6 +2700,15 @@ class SegmentService:
.where(DocumentSegment.document_id == document.id)
.scalar()
)
# Batch query existing hashes before the loop
segment_hashes = [helper.generate_text_hash(seg["content"]) for seg in segments]
existing_segments = db.session.query(DocumentSegment.index_node_hash).filter(
DocumentSegment.dataset_id == document.dataset_id,
DocumentSegment.index_node_hash.in_(segment_hashes),
DocumentSegment.enabled == True
).all()
existing_hashes = {seg.index_node_hash for seg in existing_segments}
pre_segment_data_list = []
segment_data_list = []
keywords_list = []
@@ -2697,6 +2717,12 @@ class SegmentService:
content = segment_item["content"]
doc_id = str(uuid.uuid4())
segment_hash = helper.generate_text_hash(content)
# Skip existing segments
if segment_hash in existing_hashes:
logger.info(f"Skipping duplicate segment with hash: {segment_hash}")
continue
tokens = 0
if dataset.indexing_technique == "high_quality" and embedding_model:
# calc embedding use tokens

218
docker/README-local-test.md Normal file
View File

@@ -0,0 +1,218 @@
# 本地测试环境设置指南
本文档说明如何创建和使用本地的 Docker Compose 测试环境,该环境不会被提交到版本控制。
## 📁 文件结构
```
docker/
├── .env # 本地环境配置
├── docker-compose.override.yaml # 本地覆盖配置
├── start-local-test.bat # Windows启动脚本
└── README-local-test.md # 本文档
```
## 🚀 快速开始
### 1. 准备环境配置文件
**使用 `.env`**
```bash
cd docker
copy .env.example .env
```
**注意**: 请确保 Docker Desktop 正在运行,然后执行启动脚本。
### 2. 修改配置(可选)
编辑你选择的环境文件,调整适合本地测试的配置:
```bash
# 开发环境
DEPLOY_ENV=DEVELOPMENT
# 启用调试
DEBUG=true
FLASK_DEBUG=true
LOG_LEVEL=DEBUG
# 数据库配置(保持默认即可)
DB_USERNAME=postgres
DB_PASSWORD=difyai123456
# 向量存储本地测试推荐Weaviate
VECTOR_STORE=weaviate
```
### 3. 启动测试环境
**Windows用户**
```cmd
cd docker
start-local-test.bat
```
**脚本会自动**
- 检查 Docker Desktop 是否运行
- 验证 `.env` 配置文件存在
- 构建 worker 镜像(使用本地 Dockerfile)
- 启动所有服务
或者手动启动:
```bash
# 启动中间件(数据库、Redis、向量存储)
docker compose -f docker-compose.middleware.yaml --profile weaviate up -d
# 启动应用服务
docker compose up -d
```
## 🎯 服务说明
### 中间件服务(docker-compose.middleware.yaml)
- **PostgreSQL**: 主数据库
- **Redis**: 缓存和消息队列
- **Weaviate**: 向量数据库(默认)
- **其他**: 可根据需要启用不同的向量存储
### 应用服务(docker-compose.yaml + override)
- **API**: 后端服务(开发模式,支持热重载)
- **Web**: 前端服务(开发模式)
- **Nginx**: 反向代理
- **Worker**: 后台任务处理
## 📝 本地开发特性
### 热重载
- API服务会自动检测代码变化并重启
- Web服务支持前端热重载
### 数据持久化
数据存储在 `docker/volumes/` 目录下,会在容器重启后保留。
### 调试支持
- 启用Flask调试模式
- 详细的日志输出
- API文档自动生成
## 🛠️ 常用命令
```bash
# 查看服务状态
docker compose ps
# 查看日志
docker compose logs -f [service_name]
# 重启特定服务
docker compose restart api
# 进入容器调试
docker compose exec api bash
# 停止所有服务
docker compose down
# 停止并清理数据卷
docker compose -f docker-compose.middleware.yaml down -v
```
## 🔧 自定义配置
### 修改端口
在环境文件中修改:
```bash
DIFY_PORT=5002 # API端口
EXPOSE_NGINX_PORT=8080 # Web端口
```
### 切换向量存储
在环境文件中修改:
```bash
VECTOR_STORE=qdrant # 或 milvus, chroma 等
```
然后重新启动中间件:
```bash
docker compose -f docker-compose.middleware.yaml --profile qdrant up -d
```
### 使用本地 Dockerfile
如果需要使用自定义的 Dockerfile(比如使用国内镜像加速):
1. **创建本地 Dockerfile**
```bash
# 复制原文件
cp api/Dockerfile api/Dockerfile.local
# 编辑本地文件(比如取消阿里云镜像注释)
# 编辑本地文件,第 15 行取消注释:RUN sed -i 's@deb.debian.org@mirrors.aliyun.com@g' /etc/apt/sources.list.d/debian.sources
```
2. **配置 override 使用本地 Dockerfile**
`docker-compose.override.yaml` 已经配置好了使用 `Dockerfile.local`
3. **构建时会自动使用**
```bash
docker compose --env-file .env build worker
```
### 添加自定义服务
编辑 `docker-compose.override.yaml` 添加新服务。
## 📚 最佳实践
1. **不要修改官方文件**: 不要直接修改 `docker-compose.yaml`,所有本地改动都放在 `docker-compose.override.yaml` 中。
2. **使用有意义的环境文件**: 使用 `.env` 文件进行本地配置。
3. **定期清理**: 测试完成后清理不需要的数据卷。
4. **版本控制**: 这些本地文件(`.env`, `docker-compose.override.yaml`, `Dockerfile.local`)会被 `.gitignore` 忽略,不会提交到仓库。
## 🐛 故障排除
### 服务启动失败
```bash
# 检查端口占用
netstat -tulpn | grep :5001
# 检查Docker资源
docker system df
# 查看详细日志
docker compose logs
```
### 数据库连接问题
```bash
# 检查数据库状态
docker compose exec db pg_isready
# 重置数据库
docker compose down
docker volume rm dify_db_data
docker compose up -d db
```
### 内存不足
减少服务资源使用:
```yaml
# 在 docker-compose.override.yaml 中添加
services:
db:
environment:
POSTGRES_SHARED_BUFFERS: 64MB
redis:
command: redis-server --maxmemory 64mb
```
## 📞 获取帮助
如果遇到问题,请:
1. 检查本文档
2. 查看 [官方文档](https://docs.dify.ai)
3. 在GitHub Issues中搜索类似问题

View File

@@ -0,0 +1,63 @@
@echo off
REM Switch console code page to UTF-8 so any non-ASCII output renders correctly.
chcp 65001 >nul
REM Dify Local Test Environment Startup Script (Windows)
REM Used to quickly start local development and testing environment
REM Flow: verify Docker daemon and .env config, build the worker image,
REM then bring up all services with docker compose.
echo [INFO] Starting Dify local test environment...
REM Ensure in docker directory (%~dp0 expands to this script's directory; /d also switches drives)
cd /d "%~dp0"
REM Check if Docker is running ("docker info" fails fast when the daemon is unreachable; output discarded)
docker info >nul 2>&1
if errorlevel 1 (
echo [ERROR] Docker is not running. Please start Docker Desktop first.
pause
exit /b 1
)
REM Check if .env file exists (required by the --env-file flags below)
if not exist ".env" (
echo [ERROR] .env configuration file not found
echo Please create first: copy .env.example .env
pause
exit /b 1
)
echo [INFO] Using config file: .env
REM Build worker image
REM NOTE(review): presumably the compose config points the worker at Dockerfile.local
REM via docker-compose.override.yaml — confirm against that file.
echo [INFO] Building worker image...
docker compose --env-file .env build worker
if errorlevel 1 (
echo [ERROR] Failed to build worker image
pause
exit /b 1
)
REM Start all services in detached mode
echo [INFO] Starting all services...
docker compose --env-file .env up -d
if errorlevel 1 (
echo [ERROR] Failed to start services
pause
exit /b 1
)
REM Success summary: print service URLs and common maintenance commands.
echo [SUCCESS] Local test environment started successfully!
echo.
echo [SERVICES] Service URLs:
echo - Web UI: http://localhost
echo - API Docs: http://localhost/swagger-ui.html
echo - API Service: http://localhost:5001
echo.
echo [COMMANDS] Available commands:
echo - View logs: docker compose logs -f
echo - Stop services: docker compose down
echo - Clean data: docker compose -f docker-compose.middleware.yaml down -v
echo - Restart services: docker compose restart
echo.
echo [TIP] If first run, wait a few minutes for services to fully start
echo Use 'docker compose ps' to check service status
REM Keep the console window open so the user can read the output.
pause