Compare commits


6 Commits

Author           SHA1        Message                                                    Date
Yanli 盐粒       60c858aa48  style: normalize import ordering in dataset model          2026-03-01 17:40:28 +08:00
Yanli 盐粒       46a3a2ae09  fix: sort signed URL replacements by position              2026-03-01 17:38:45 +08:00
Yanli 盐粒       30af50cb47  test: avoid MagicMock dependency in segment signing tests  2026-03-01 17:23:18 +08:00
Yanli 盐粒       0d6a4bac0f  fix: harden segment file URL signing                       2026-03-01 17:12:07 +08:00
autofix-ci[bot]  c693cb9789  [autofix.ci] apply automated fixes                         2026-02-20 18:41:55 +00:00
Yanli 盐粒       fc91a7a38b  Fix docx segment image URLs                                2026-02-09 20:14:52 +08:00
6 changed files with 193 additions and 79 deletions

View File

@@ -114,7 +114,6 @@ class PdfExtractor(BaseExtractor):
         """
         image_content = []
         upload_files = []
-        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
         try:
             image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))

@@ -164,7 +163,7 @@ class PdfExtractor(BaseExtractor):
                         used_at=naive_utc_now(),
                     )
                     upload_files.append(upload_file)
-                    image_content.append(f"![image]({base_url}/files/{upload_file.id}/file-preview)")
+                    image_content.append(f"![image](/files/{upload_file.id}/file-preview)")
                 except Exception as e:
                     logger.warning("Failed to extract image from PDF: %s", e)
                     continue

View File

@@ -87,7 +87,6 @@ class WordExtractor(BaseExtractor):
     def _extract_images_from_docx(self, doc):
         image_count = 0
         image_map = {}
-        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
         for r_id, rel in doc.part.rels.items():
             if "image" in rel.target_ref:

@@ -126,7 +125,7 @@ class WordExtractor(BaseExtractor):
                         used_at=naive_utc_now(),
                     )
                     db.session.add(upload_file)
-                    image_map[r_id] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
+                    image_map[r_id] = f"![image](/files/{upload_file.id}/file-preview)"
                 else:
                     image_ext = rel.target_ref.split(".")[-1]
                     if image_ext is None:

@@ -154,7 +153,7 @@ class WordExtractor(BaseExtractor):
                         used_at=naive_utc_now(),
                     )
                     db.session.add(upload_file)
-                    image_map[rel.target_part] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
+                    image_map[rel.target_part] = f"![image](/files/{upload_file.id}/file-preview)"
         db.session.commit()
         return image_map
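
Note: taken together, the two extractor hunks above stop prefixing preview links with a configured host (INTERNAL_FILES_URL or FILES_URL). Extracted segments now store bare relative links such as ![image](/files/<id>/file-preview), and signing is deferred to DocumentSegment.get_sign_content. The widened patterns in the model diff below accept both the new relative form and legacy absolute URLs, so previously stored segments keep working. A minimal sketch of that matching behavior, using the file-preview pattern exactly as it appears in the diff (the sample URLs are illustrative):

import re

# Pattern from the dataset model diff below: an optional scheme/host prefix,
# then /files/<id>/file-preview, then an optional query string.
FILE_PREVIEW = r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?"

samples = [
    "![image](/files/1602650a-4fe4-423c-85a2-af76c083e3c4/file-preview)",  # new relative form
    "![image](http://internal.docker:5001/files/1602650a-4fe4-423c-85a2-af76c083e3c4/file-preview)",  # legacy absolute form
]
for text in samples:
    match = re.search(FILE_PREVIEW, text)
    assert match is not None
    # group(1) is the upload file id in both cases; rebuilding the URL as
    # /files/<id>/file-preview drops any stored host prefix.
    print(match.group(1))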

View File

@@ -10,6 +10,7 @@ import re
 import time
 from datetime import datetime
 from json import JSONDecodeError
+from operator import itemgetter
 from typing import Any, cast
 from uuid import uuid4

@@ -804,41 +805,53 @@ class DocumentSegment(Base):
     @property
     def sign_content(self) -> str:
         return self.get_sign_content()

+    @staticmethod
+    def _build_signed_query_params(*, sign_target: str, upload_file_id: str) -> str:
+        nonce = os.urandom(16).hex()
+        timestamp = str(int(time.time()))
+        data_to_sign = f"{sign_target}|{upload_file_id}|{timestamp}|{nonce}"
+        secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
+        sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
+        encoded_sign = base64.urlsafe_b64encode(sign).decode()
+        return f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
+
+    def _get_accessible_upload_file_ids(self, upload_file_ids: set[str]) -> set[str]:
+        if not upload_file_ids:
+            return set()
+        matched_upload_file_ids = db.session.scalars(
+            select(UploadFile.id).where(
+                UploadFile.tenant_id == self.tenant_id,
+                UploadFile.id.in_(list(upload_file_ids)),
+            )
+        ).all()
+        return {str(upload_file_id) for upload_file_id in matched_upload_file_ids}
+
     def get_sign_content(self) -> str:
         signed_urls: list[tuple[int, int, str]] = []
         text = self.content

-        # For data before v0.10.0
-        pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
-        matches = re.finditer(pattern, text)
-        for match in matches:
-            upload_file_id = match.group(1)
-            nonce = os.urandom(16).hex()
-            timestamp = str(int(time.time()))
-            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
-            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
-            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
-            encoded_sign = base64.urlsafe_b64encode(sign).decode()
-            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
-            base_url = f"/files/{upload_file_id}/image-preview"
-            signed_url = f"{base_url}?{params}"
-            signed_urls.append((match.start(), match.end(), signed_url))
-
-        # For data after v0.10.0
-        pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
-        matches = re.finditer(pattern, text)
-        for match in matches:
-            upload_file_id = match.group(1)
-            nonce = os.urandom(16).hex()
-            timestamp = str(int(time.time()))
-            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
-            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
-            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
-            encoded_sign = base64.urlsafe_b64encode(sign).decode()
-            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
-            base_url = f"/files/{upload_file_id}/file-preview"
+        upload_file_preview_patterns = {
+            "image-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?",
+            "file-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?",
+        }
+        upload_file_matches: list[tuple[re.Match[str], str, str]] = []
+        upload_file_ids: set[str] = set()
+        for preview_type, pattern in upload_file_preview_patterns.items():
+            for match in re.finditer(pattern, text):
+                upload_file_id = match.group(1)
+                upload_file_matches.append((match, preview_type, upload_file_id))
+                upload_file_ids.add(upload_file_id)
+
+        accessible_upload_file_ids = self._get_accessible_upload_file_ids(upload_file_ids)
+
+        for match, preview_type, upload_file_id in upload_file_matches:
+            if upload_file_id not in accessible_upload_file_ids:
+                continue
+            params = self._build_signed_query_params(sign_target=preview_type, upload_file_id=upload_file_id)
+            base_url = f"/files/{upload_file_id}/{preview_type}"
             signed_url = f"{base_url}?{params}"
             signed_urls.append((match.start(), match.end(), signed_url))

@@ -849,19 +862,13 @@ class DocumentSegment(Base):
         for match in matches:
             upload_file_id = match.group(1)
             file_extension = match.group(2)
-            nonce = os.urandom(16).hex()
-            timestamp = str(int(time.time()))
-            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
-            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
-            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
-            encoded_sign = base64.urlsafe_b64encode(sign).decode()
-            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
+            params = self._build_signed_query_params(sign_target="file-preview", upload_file_id=upload_file_id)
             base_url = f"/files/tools/{upload_file_id}.{file_extension}"
             signed_url = f"{base_url}?{params}"
             signed_urls.append((match.start(), match.end(), signed_url))

         # Reconstruct the text with signed URLs
+        signed_urls.sort(key=itemgetter(0))
         offset = 0
         for start, end, signed_url in signed_urls:
             text = text[: start + offset] + signed_url + text[end + offset :]
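
Note: the model hunks above bundle three behavior changes. Signing is factored into _build_signed_query_params (HMAC-SHA256 over "sign_target|upload_file_id|timestamp|nonce"), only upload files that exist under the segment's tenant get signed (_get_accessible_upload_file_ids), and the replacement list is sorted by match position before the offset-based splice, since matches now come from two separate regex passes and are no longer guaranteed to arrive in document order. A self-contained sketch of the sign-and-splice flow, assuming a stand-in secret key and tenant check (the real code reads dify_config.SECRET_KEY and queries UploadFile); the offset update after the splice is inferred, as the hunk ends just before it:

import base64
import hashlib
import hmac
import os
import re
import time
from operator import itemgetter

SECRET_KEY = b"secret"  # stand-in for dify_config.SECRET_KEY

# Patterns copied from the diff above.
PREVIEW_PATTERNS = {
    "image-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?",
    "file-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?",
}


def build_signed_query_params(sign_target: str, upload_file_id: str) -> str:
    # Same recipe as _build_signed_query_params above.
    nonce = os.urandom(16).hex()
    timestamp = str(int(time.time()))
    data_to_sign = f"{sign_target}|{upload_file_id}|{timestamp}|{nonce}"
    sign = hmac.new(SECRET_KEY, data_to_sign.encode(), hashlib.sha256).digest()
    encoded_sign = base64.urlsafe_b64encode(sign).decode()
    return f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"


def sign_content(text: str, accessible_ids: set[str]) -> str:
    signed_urls: list[tuple[int, int, str]] = []
    for preview_type, pattern in PREVIEW_PATTERNS.items():
        for match in re.finditer(pattern, text):
            upload_file_id = match.group(1)
            if upload_file_id not in accessible_ids:  # stand-in for the tenant query
                continue
            params = build_signed_query_params(preview_type, upload_file_id)
            signed_url = f"/files/{upload_file_id}/{preview_type}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))
    # The image-preview pass runs before the file-preview pass, so a
    # file-preview link that appears first in the text would be appended
    # last; sorting by start position keeps the offset arithmetic valid.
    signed_urls.sort(key=itemgetter(0))
    offset = 0
    for start, end, signed_url in signed_urls:
        text = text[: start + offset] + signed_url + text[end + offset :]
        offset += len(signed_url) - (end - start)  # realign later positions
    return text


# Example: a mixed segment where file-preview precedes image-preview, as in
# the mixed-order regression test further down.
body = (
    "![f](/files/1602650a-4fe4-423c-85a2-af76c083e3c4/file-preview) "
    "![i](/files/e2a4f7b1-1234-5678-9abc-def012345678/image-preview)"
)
print(sign_content(body, {"1602650a-4fe4-423c-85a2-af76c083e3c4",
                          "e2a4f7b1-1234-5678-9abc-def012345678"}))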

View File

@@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp
     mock_raw.FPDF_PAGEOBJ_IMAGE = 1
     result = extractor._extract_images(mock_page)
-    assert f"![image](http://files.local/files/{file_id}/file-preview)" in result
+    assert f"![image](/files/{file_id}/file-preview)" in result
     assert len(saves) == 1
     assert saves[0][1] == image_bytes
     assert len(db_stub.session.added) == 1

@@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies):
     result = extractor._extract_images(mock_page)
     # Should have one success
-    assert "![image](http://files.local/files/test_file_id/file-preview)" in result
+    assert "![image](/files/test_file_id/file-preview)" in result
     assert len(saves) == 1
     assert saves[0][1] == jpeg_bytes
     assert db_stub.session.committed is True

View File

@@ -121,8 +121,7 @@ def test_extract_images_from_docx(monkeypatch):
     db_stub = SimpleNamespace(session=DummySession())
     monkeypatch.setattr(we, "db", db_stub)
-    # Patch config values used for URL composition and storage type
-    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
+    # Patch config value used in this code path
     monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
     # Patch UploadFile to avoid real DB models

@@ -164,7 +163,7 @@ def test_extract_images_from_docx(monkeypatch):
     # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
     assert set(image_map.keys()) == {"rId1", internal_part}
-    assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values())
+    assert all(v.startswith("![image](/files/") and v.endswith("/file-preview)") for v in image_map.values())
     # Storage should receive both payloads
     payloads = {data for _, data in saves}

@@ -176,39 +175,6 @@ def test_extract_images_from_docx(monkeypatch):
     assert db_stub.session.committed is True

-def test_extract_images_from_docx_uses_internal_files_url():
-    """Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access."""
-    # Test the URL generation logic directly
-    from configs import dify_config
-
-    # Mock the configuration values
-    original_files_url = getattr(dify_config, "FILES_URL", None)
-    original_internal_files_url = getattr(dify_config, "INTERNAL_FILES_URL", None)
-    try:
-        # Set both URLs - INTERNAL should take precedence
-        dify_config.FILES_URL = "http://external.example.com"
-        dify_config.INTERNAL_FILES_URL = "http://internal.docker:5001"
-        # Test the URL generation logic (same as in word_extractor.py)
-        upload_file_id = "test_file_id"
-        # This is the pattern we fixed in the word extractor
-        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
-        generated_url = f"{base_url}/files/{upload_file_id}/file-preview"
-        # Verify that INTERNAL_FILES_URL is used instead of FILES_URL
-        assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}"
-        assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}"
-    finally:
-        # Restore original values
-        if original_files_url is not None:
-            dify_config.FILES_URL = original_files_url
-        if original_internal_files_url is not None:
-            dify_config.INTERNAL_FILES_URL = original_internal_files_url
-
 def test_extract_hyperlinks(monkeypatch):
     # Mock db and storage to avoid issues during image extraction (even if no images are present)
     monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))

View File

@@ -15,6 +15,7 @@ from datetime import UTC, datetime
 from unittest.mock import MagicMock, patch
 from uuid import uuid4

+import models.dataset as dataset_module
 from models.dataset import (
     AppDatasetJoin,
     ChildChunk,

@@ -489,6 +490,15 @@
 class TestDocumentSegmentIndexing:
     """Test suite for DocumentSegment model indexing and operations."""

+    @staticmethod
+    def _mock_scalars_result(upload_file_ids: list[str]):
+        class _ScalarsResult:
+            @staticmethod
+            def all() -> list[str]:
+                return upload_file_ids
+
+        return _ScalarsResult()
+
     def test_document_segment_creation_with_required_fields(self):
         """Test creating a document segment with all required fields."""
         # Arrange
@@ -547,6 +557,139 @@ class TestDocumentSegmentIndexing:
         assert segment.index_node_hash == index_node_hash
         assert segment.keywords == keywords

+    def test_document_segment_sign_content_strips_absolute_files_host(self):
+        """Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL."""
+        # Arrange
+        upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
+        segment = DocumentSegment(
+            tenant_id=str(uuid4()),
+            dataset_id=str(uuid4()),
+            document_id=str(uuid4()),
+            position=1,
+            content=f"![image](http://internal.docker:5001/files/{upload_file_id}/file-preview)",
+            word_count=1,
+            tokens=1,
+            created_by=str(uuid4()),
+        )
+        mock_scalars_result = self._mock_scalars_result([upload_file_id])
+
+        # Act
+        with (
+            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
+            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
+            patch("models.dataset.time.time", return_value=1700000000),
+            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
+        ):
+            signed = segment.get_sign_content()
+
+        # Assert
+        assert "internal.docker:5001" not in signed
+        assert f"/files/{upload_file_id}/file-preview?timestamp=" in signed
+        assert "&nonce=" in signed
+        assert "&sign=" in signed
+
+    def test_document_segment_sign_content_strips_absolute_files_host_for_image_preview(self):
+        """Test that sign_content strips scheme/host from absolute image-preview URLs."""
+        # Arrange
+        upload_file_id = "e2a4f7b1-1234-5678-9abc-def012345678"
+        segment = DocumentSegment(
+            tenant_id=str(uuid4()),
+            dataset_id=str(uuid4()),
+            document_id=str(uuid4()),
+            position=1,
+            content=f"![image](http://internal.docker:5001/files/{upload_file_id}/image-preview)",
+            word_count=1,
+            tokens=1,
+            created_by=str(uuid4()),
+        )
+        mock_scalars_result = self._mock_scalars_result([upload_file_id])
+
+        # Act
+        with (
+            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
+            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
+            patch("models.dataset.time.time", return_value=1700000000),
+            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
+        ):
+            signed = segment.get_sign_content()
+
+        # Assert
+        assert "internal.docker:5001" not in signed
+        assert f"/files/{upload_file_id}/image-preview?timestamp=" in signed
+        assert "&nonce=" in signed
+        assert "&sign=" in signed
+
+    def test_document_segment_sign_content_skips_upload_files_outside_tenant(self):
+        """Test that sign_content only signs upload files belonging to the segment tenant."""
+        # Arrange
+        allowed_upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
+        denied_upload_file_id = "f8f35fca-568f-4626-adf0-4f30de96aa32"
+        segment = DocumentSegment(
+            tenant_id=str(uuid4()),
+            dataset_id=str(uuid4()),
+            document_id=str(uuid4()),
+            position=1,
+            content=(
+                f"allowed: ![image](/files/{allowed_upload_file_id}/file-preview) "
+                f"denied: ![image](/files/{denied_upload_file_id}/file-preview)"
+            ),
+            word_count=1,
+            tokens=1,
+            created_by=str(uuid4()),
+        )
+        mock_scalars_result = self._mock_scalars_result([allowed_upload_file_id])
+
+        # Act
+        with (
+            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
+            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
+            patch("models.dataset.time.time", return_value=1700000000),
+            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
+        ):
+            signed = segment.get_sign_content()
+
+        # Assert
+        assert f"/files/{allowed_upload_file_id}/file-preview?timestamp=" in signed
+        assert f"/files/{denied_upload_file_id}/file-preview?timestamp=" not in signed
+        assert f"/files/{denied_upload_file_id}/file-preview)" in signed
+
+    def test_document_segment_sign_content_handles_mixed_preview_order(self):
+        """Test that sign_content preserves content when file-preview appears before image-preview."""
+        # Arrange
+        file_preview_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
+        image_preview_id = "e2a4f7b1-1234-5678-9abc-def012345678"
+        segment = DocumentSegment(
+            tenant_id=str(uuid4()),
+            dataset_id=str(uuid4()),
+            document_id=str(uuid4()),
+            position=1,
+            content=(
+                f"file-first: ![file](/files/{file_preview_id}/file-preview) "
+                f"then-image: ![image](/files/{image_preview_id}/image-preview)"
+            ),
+            word_count=1,
+            tokens=1,
+            created_by=str(uuid4()),
+        )
+        mock_scalars_result = self._mock_scalars_result([file_preview_id, image_preview_id])
+
+        # Act
+        with (
+            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
+            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
+            patch("models.dataset.time.time", return_value=1700000000),
+            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
+        ):
+            signed = segment.get_sign_content()
+
+        # Assert
+        file_signed = f"/files/{file_preview_id}/file-preview?timestamp="
+        image_signed = f"/files/{image_preview_id}/image-preview?timestamp="
+        assert file_signed in signed
+        assert image_signed in signed
+        assert signed.index(file_signed) < signed.index(image_signed)
+        assert signed.count("&sign=") == 2
+
     def test_document_segment_with_answer_field(self):
         """Test creating a document segment with answer field for QA model."""
         # Arrange