feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN-
2025-09-18 12:49:10 +08:00
committed by GitHub
parent 7dadb33003
commit 85cda47c70
1772 changed files with 102407 additions and 31710 deletions

View File

@@ -354,6 +354,11 @@ def delete_draft_variables_batch(app_id: str, batch_size: int = 1000) -> int:
"""
Delete draft variables for an app in batches.
This function now handles cleanup of associated Offload data including:
- WorkflowDraftVariableFile records
- UploadFile records
- Object storage files
Args:
app_id: The ID of the app whose draft variables should be deleted
batch_size: Number of records to delete per batch
@@ -365,22 +370,31 @@ def delete_draft_variables_batch(app_id: str, batch_size: int = 1000) -> int:
raise ValueError("batch_size must be positive")
total_deleted = 0
total_files_deleted = 0
while True:
with db.engine.begin() as conn:
# Get a batch of draft variable IDs
# Get a batch of draft variable IDs along with their file_ids
query_sql = """
SELECT id FROM workflow_draft_variables
SELECT id, file_id FROM workflow_draft_variables
WHERE app_id = :app_id
LIMIT :batch_size
"""
result = conn.execute(sa.text(query_sql), {"app_id": app_id, "batch_size": batch_size})
draft_var_ids = [row[0] for row in result]
if not draft_var_ids:
rows = list(result)
if not rows:
break
# Delete the batch
draft_var_ids = [row[0] for row in rows]
file_ids = [row[1] for row in rows if row[1] is not None]
# Clean up associated Offload data first
if file_ids:
files_deleted = _delete_draft_variable_offload_data(conn, file_ids)
total_files_deleted += files_deleted
# Delete the draft variables
delete_sql = """
DELETE FROM workflow_draft_variables
WHERE id IN :ids
@@ -391,11 +405,86 @@ def delete_draft_variables_batch(app_id: str, batch_size: int = 1000) -> int:
logger.info(click.style(f"Deleted {batch_deleted} draft variables (batch) for app {app_id}", fg="green"))
logger.info(click.style(f"Deleted {total_deleted} total draft variables for app {app_id}", fg="green"))
logger.info(
click.style(
f"Deleted {total_deleted} total draft variables for app {app_id}. "
f"Cleaned up {total_files_deleted} total associated files.",
fg="green",
)
)
return total_deleted
def _delete_records(query_sql: str, params: dict, delete_func: Callable, name: str):
def _delete_draft_variable_offload_data(conn, file_ids: list[str]) -> int:
"""
Delete Offload data associated with WorkflowDraftVariable file_ids.
This function:
1. Finds WorkflowDraftVariableFile records by file_ids
2. Deletes associated files from object storage
3. Deletes UploadFile records
4. Deletes WorkflowDraftVariableFile records
Args:
conn: Database connection
file_ids: List of WorkflowDraftVariableFile IDs
Returns:
Number of files cleaned up
"""
from extensions.ext_storage import storage
if not file_ids:
return 0
files_deleted = 0
try:
# Get WorkflowDraftVariableFile records and their associated UploadFile keys
query_sql = """
SELECT wdvf.id, uf.key, uf.id as upload_file_id
FROM workflow_draft_variable_files wdvf
JOIN upload_files uf ON wdvf.upload_file_id = uf.id
WHERE wdvf.id IN :file_ids
"""
result = conn.execute(sa.text(query_sql), {"file_ids": tuple(file_ids)})
file_records = list(result)
# Delete from object storage and collect upload file IDs
upload_file_ids = []
for _, storage_key, upload_file_id in file_records:
try:
storage.delete(storage_key)
upload_file_ids.append(upload_file_id)
files_deleted += 1
except Exception:
logging.exception("Failed to delete storage object %s", storage_key)
# Continue with database cleanup even if storage deletion fails
upload_file_ids.append(upload_file_id)
# Delete UploadFile records
if upload_file_ids:
delete_upload_files_sql = """
DELETE FROM upload_files
WHERE id IN :upload_file_ids
"""
conn.execute(sa.text(delete_upload_files_sql), {"upload_file_ids": tuple(upload_file_ids)})
# Delete WorkflowDraftVariableFile records
delete_variable_files_sql = """
DELETE FROM workflow_draft_variable_files
WHERE id IN :file_ids
"""
conn.execute(sa.text(delete_variable_files_sql), {"file_ids": tuple(file_ids)})
except Exception:
logging.exception("Error deleting draft variable offload data:")
# Don't raise, as we want to continue with the main deletion process
return files_deleted
def _delete_records(query_sql: str, params: dict, delete_func: Callable, name: str) -> None:
while True:
with db.engine.begin() as conn:
rs = conn.execute(sa.text(query_sql), params)