mirror of
https://github.com/langgenius/dify.git
synced 2025-12-28 10:07:24 +00:00
Compare commits
3 Commits
feat/fallb
...
feat/chunk
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
626e71cb3b | ||
|
|
07047487c3 | ||
|
|
c7064d44af |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -147,6 +147,7 @@ api/.idea
|
||||
|
||||
api/.env
|
||||
api/storage/*
|
||||
api/Dockerfile.local
|
||||
|
||||
docker-legacy/volumes/app/storage/*
|
||||
docker-legacy/volumes/db/data/*
|
||||
|
||||
@@ -93,6 +93,17 @@ class DatasetDocumentStore:
|
||||
|
||||
segment_document = self.get_document_segment(doc_id=doc.metadata["doc_id"])
|
||||
|
||||
# Check if a segment with the same content hash already exists in the dataset
|
||||
existing_segment_by_hash = db.session.query(DocumentSegment).filter_by(
|
||||
dataset_id=self._dataset.id,
|
||||
index_node_hash=doc.metadata["doc_hash"],
|
||||
enabled=True
|
||||
).first()
|
||||
|
||||
if existing_segment_by_hash:
|
||||
# Skip creating duplicate segment with same content hash
|
||||
continue
|
||||
|
||||
# NOTE: doc could already exist in the store, but we overwrite it
|
||||
if not allow_update and segment_document:
|
||||
raise ValueError(
|
||||
|
||||
@@ -689,6 +689,7 @@ class DocumentSegment(Base):
|
||||
sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
|
||||
sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
|
||||
sa.Index("document_segment_tenant_idx", "tenant_id"),
|
||||
sa.Index("document_segment_dataset_hash_idx", "dataset_id", "index_node_hash"),
|
||||
)
|
||||
|
||||
# initial fields
|
||||
|
||||
@@ -2623,6 +2623,17 @@ class SegmentService:
|
||||
tokens = embedding_model.get_text_embedding_num_tokens(texts=[content])[0]
|
||||
lock_name = f"add_segment_lock_document_id_{document.id}"
|
||||
with redis_client.lock(lock_name, timeout=600):
|
||||
# Check if a segment with the same content hash already exists
|
||||
existing_segment = db.session.query(DocumentSegment).filter_by(
|
||||
dataset_id=document.dataset_id,
|
||||
index_node_hash=segment_hash,
|
||||
enabled=True
|
||||
).first()
|
||||
|
||||
if existing_segment:
|
||||
logger.info(f"Segment with same content hash already exists: {segment_hash}")
|
||||
return existing_segment
|
||||
|
||||
max_position = (
|
||||
db.session.query(func.max(DocumentSegment.position))
|
||||
.where(DocumentSegment.document_id == document.id)
|
||||
@@ -2689,6 +2700,15 @@ class SegmentService:
|
||||
.where(DocumentSegment.document_id == document.id)
|
||||
.scalar()
|
||||
)
|
||||
# Batch query existing hashes before the loop
|
||||
segment_hashes = [helper.generate_text_hash(seg["content"]) for seg in segments]
|
||||
existing_segments = db.session.query(DocumentSegment.index_node_hash).filter(
|
||||
DocumentSegment.dataset_id == document.dataset_id,
|
||||
DocumentSegment.index_node_hash.in_(segment_hashes),
|
||||
DocumentSegment.enabled == True
|
||||
).all()
|
||||
existing_hashes = {seg.index_node_hash for seg in existing_segments}
|
||||
|
||||
pre_segment_data_list = []
|
||||
segment_data_list = []
|
||||
keywords_list = []
|
||||
@@ -2697,6 +2717,12 @@ class SegmentService:
|
||||
content = segment_item["content"]
|
||||
doc_id = str(uuid.uuid4())
|
||||
segment_hash = helper.generate_text_hash(content)
|
||||
|
||||
# Skip existing segments
|
||||
if segment_hash in existing_hashes:
|
||||
logger.info(f"Skipping duplicate segment with hash: {segment_hash}")
|
||||
continue
|
||||
|
||||
tokens = 0
|
||||
if dataset.indexing_technique == "high_quality" and embedding_model:
|
||||
# calc embedding use tokens
|
||||
|
||||
218
docker/README-local-test.md
Normal file
218
docker/README-local-test.md
Normal file
@@ -0,0 +1,218 @@
|
||||
# 本地测试环境设置指南
|
||||
|
||||
本文档说明如何创建和使用本地的Docker Compose测试环境,该环境不会被提交到版本控制。
|
||||
|
||||
## 📁 文件结构
|
||||
|
||||
```
|
||||
docker/
|
||||
├── .env # 本地环境配置
|
||||
├── docker-compose.override.yaml # 本地覆盖配置
|
||||
├── start-local-test.bat # Windows启动脚本
|
||||
└── README-local-test.md # 本文档
|
||||
```
|
||||
|
||||
## 🚀 快速开始
|
||||
|
||||
### 1. 准备环境配置文件
|
||||
|
||||
**使用 `.env`**
|
||||
```bash
|
||||
cd docker
|
||||
cp .env.example .env    # Windows CMD 下使用: copy .env.example .env
|
||||
```
|
||||
|
||||
**注意**: 请确保 Docker Desktop 正在运行,然后执行启动脚本。
|
||||
|
||||
### 2. 修改配置(可选)
|
||||
|
||||
编辑 `.env` 文件,调整适合本地测试的配置:
|
||||
|
||||
```bash
|
||||
# 开发环境
|
||||
DEPLOY_ENV=DEVELOPMENT
|
||||
|
||||
# 启用调试
|
||||
DEBUG=true
|
||||
FLASK_DEBUG=true
|
||||
LOG_LEVEL=DEBUG
|
||||
|
||||
# 数据库配置(保持默认即可)
|
||||
DB_USERNAME=postgres
|
||||
DB_PASSWORD=difyai123456
|
||||
|
||||
# 向量存储(本地测试推荐Weaviate)
|
||||
VECTOR_STORE=weaviate
|
||||
```
|
||||
|
||||
### 3. 启动测试环境
|
||||
|
||||
**Windows用户**:
|
||||
```cmd
|
||||
cd docker
|
||||
start-local-test.bat
|
||||
```
|
||||
|
||||
**脚本会自动**:
|
||||
- 检查 Docker Desktop 是否运行
|
||||
- 验证 `.env` 配置文件存在
|
||||
- 构建 worker 镜像(使用本地 Dockerfile)
|
||||
- 启动所有服务
|
||||
|
||||
或者手动启动:
|
||||
|
||||
```bash
|
||||
# 启动中间件(数据库、Redis、向量存储)
|
||||
docker compose -f docker-compose.middleware.yaml --profile weaviate up -d
|
||||
|
||||
# 启动应用服务
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
## 🎯 服务说明
|
||||
|
||||
### 中间件服务(docker-compose.middleware.yaml)
|
||||
- **PostgreSQL**: 主数据库
|
||||
- **Redis**: 缓存和消息队列
|
||||
- **Weaviate**: 向量数据库(默认)
|
||||
- **其他**: 可根据需要启用不同的向量存储
|
||||
|
||||
### 应用服务(docker-compose.yaml + override)
|
||||
- **API**: 后端服务(开发模式,支持热重载)
|
||||
- **Web**: 前端服务(开发模式)
|
||||
- **Nginx**: 反向代理
|
||||
- **Worker**: 后台任务处理
|
||||
|
||||
## 📝 本地开发特性
|
||||
|
||||
### 热重载
|
||||
- API服务会自动检测代码变化并重启
|
||||
- Web服务支持前端热重载
|
||||
|
||||
### 数据持久化
|
||||
数据存储在 `docker/volumes/` 目录下,会在容器重启后保留。
|
||||
|
||||
### 调试支持
|
||||
- 启用Flask调试模式
|
||||
- 详细的日志输出
|
||||
- API文档自动生成
|
||||
|
||||
## 🛠️ 常用命令
|
||||
|
||||
```bash
|
||||
# 查看服务状态
|
||||
docker compose ps
|
||||
|
||||
# 查看日志
|
||||
docker compose logs -f [service_name]
|
||||
|
||||
# 重启特定服务
|
||||
docker compose restart api
|
||||
|
||||
# 进入容器调试
|
||||
docker compose exec api bash
|
||||
|
||||
# 停止所有服务
|
||||
docker compose down
|
||||
|
||||
# 停止并清理数据卷
|
||||
docker compose -f docker-compose.middleware.yaml down -v
|
||||
```
|
||||
|
||||
## 🔧 自定义配置
|
||||
|
||||
### 修改端口
|
||||
在环境文件中修改:
|
||||
```bash
|
||||
DIFY_PORT=5002 # API端口
|
||||
EXPOSE_NGINX_PORT=8080 # Web端口
|
||||
```
|
||||
|
||||
### 切换向量存储
|
||||
在环境文件中修改:
|
||||
```bash
|
||||
VECTOR_STORE=qdrant # 或 milvus, chroma 等
|
||||
```
|
||||
|
||||
然后重新启动中间件:
|
||||
```bash
|
||||
docker compose -f docker-compose.middleware.yaml --profile qdrant up -d
|
||||
```
|
||||
|
||||
### 使用本地 Dockerfile
|
||||
|
||||
如果需要使用自定义的 Dockerfile(比如使用国内镜像加速):
|
||||
|
||||
1. **创建本地 Dockerfile**:
|
||||
```bash
|
||||
# 复制原文件
|
||||
cp api/Dockerfile api/Dockerfile.local
|
||||
|
||||
# 编辑本地文件(比如取消阿里云镜像注释)
|
||||
# 第15行取消注释:RUN sed -i 's@deb.debian.org@mirrors.aliyun.com@g' /etc/apt/sources.list.d/debian.sources
|
||||
```
|
||||
|
||||
2. **配置 override 使用本地 Dockerfile**:
|
||||
`docker-compose.override.yaml` 已经配置好了使用 `Dockerfile.local`
|
||||
|
||||
3. **构建时会自动使用**:
|
||||
```bash
|
||||
docker compose --env-file .env build worker
|
||||
```
|
||||
|
||||
### 添加自定义服务
|
||||
编辑 `docker-compose.override.yaml` 添加新服务。
|
||||
|
||||
## 📚 最佳实践
|
||||
|
||||
1. **不要修改官方文件**: 不要直接修改 `docker-compose.yaml`,所有本地改动都放在 `docker-compose.override.yaml` 中。
|
||||
|
||||
2. **集中管理环境配置**: 所有本地环境变量都写在 `.env` 文件中。
|
||||
|
||||
3. **定期清理**: 测试完成后清理不需要的数据卷。
|
||||
|
||||
4. **版本控制**: 这些本地文件(`.env`, `docker-compose.override.yaml`, `Dockerfile.local`)会被 `.gitignore` 忽略,不会提交到仓库。
|
||||
|
||||
## 🐛 故障排除
|
||||
|
||||
### 服务启动失败
|
||||
```bash
|
||||
# 检查端口占用
|
||||
netstat -tulpn | grep :5001      # Linux
netstat -ano | findstr :5001     # Windows
|
||||
|
||||
# 检查Docker资源
|
||||
docker system df
|
||||
|
||||
# 查看详细日志
|
||||
docker compose logs
|
||||
```
|
||||
|
||||
### 数据库连接问题
|
||||
```bash
|
||||
# 检查数据库状态
|
||||
docker compose exec db pg_isready
|
||||
|
||||
# 重置数据库
|
||||
docker compose down
|
||||
rm -rf ./volumes/db/data    # 数据通过绑定挂载保存在 docker/volumes/ 下,而不是命名卷;Windows CMD: rmdir /s /q volumes\db\data
|
||||
docker compose up -d db
|
||||
```
|
||||
|
||||
### 内存不足
|
||||
减少服务资源使用:
|
||||
```yaml
|
||||
# 在 docker-compose.override.yaml 中添加
|
||||
services:
|
||||
db:
|
||||
environment:
|
||||
POSTGRES_SHARED_BUFFERS: 64MB
|
||||
redis:
|
||||
command: redis-server --maxmemory 64mb
|
||||
```
|
||||
|
||||
## 📞 获取帮助
|
||||
|
||||
如果遇到问题,请:
|
||||
1. 检查本文档
|
||||
2. 查看 [官方文档](https://docs.dify.ai)
|
||||
3. 在GitHub Issues中搜索类似问题
|
||||
63
docker/start-local-test.bat
Normal file
63
docker/start-local-test.bat
Normal file
@@ -0,0 +1,63 @@
|
||||
@echo off
REM Switch the console code page to UTF-8 (65001); the status line is discarded.
chcp 65001 >nul
REM Dify Local Test Environment Startup Script (Windows)
REM Used to quickly start local development and testing environment
REM
REM Steps: verify Docker is reachable, verify .env exists, build the worker
REM image, then start all services in detached mode.
REM On any failure the script prints an [ERROR] line, pauses so the window
REM stays readable, and exits with code 1.
REM
REM Failures are detected with "%errorlevel% neq 0" instead of
REM "if errorlevel 1": the latter is true only for exit codes >= 1, so a
REM command ending with a negative exit code would be treated as success.

echo [INFO] Starting Dify local test environment...

REM Run from the directory that contains this script (the docker directory)
cd /d "%~dp0"

REM Check if Docker is running
docker info >nul 2>&1
if %errorlevel% neq 0 (
    echo [ERROR] Docker is not running. Please start Docker Desktop first.
    pause
    exit /b 1
)

REM Check if .env file exists
if not exist ".env" (
    echo [ERROR] .env configuration file not found
    echo Please create first: copy .env.example .env
    pause
    exit /b 1
)

echo [INFO] Using config file: .env

REM Build worker image
echo [INFO] Building worker image...
docker compose --env-file .env build worker
if %errorlevel% neq 0 (
    echo [ERROR] Failed to build worker image
    pause
    exit /b 1
)

REM Start all services
echo [INFO] Starting all services...
docker compose --env-file .env up -d
if %errorlevel% neq 0 (
    echo [ERROR] Failed to start services
    pause
    exit /b 1
)

echo [SUCCESS] Local test environment started successfully!
echo.
echo [SERVICES] Service URLs:
echo - Web UI: http://localhost
echo - API Docs: http://localhost/swagger-ui.html
echo - API Service: http://localhost:5001
echo.
echo [COMMANDS] Available commands:
echo - View logs: docker compose logs -f
echo - Stop services: docker compose down
echo - Clean data: docker compose -f docker-compose.middleware.yaml down -v
echo - Restart services: docker compose restart
echo.
echo [TIP] If first run, wait a few minutes for services to fully start
echo Use 'docker compose ps' to check service status

REM Keep the window open when the script was launched by double-click
pause
|
||||
Reference in New Issue
Block a user