Compare commits

...

6 Commits

Author SHA1 Message Date
Yanli 盐粒
60c858aa48 style: normalize import ordering in dataset model 2026-03-01 17:40:28 +08:00
Yanli 盐粒
46a3a2ae09 fix: sort signed URL replacements by position 2026-03-01 17:38:45 +08:00
Yanli 盐粒
30af50cb47 test: avoid MagicMock dependency in segment signing tests 2026-03-01 17:23:18 +08:00
Yanli 盐粒
0d6a4bac0f fix: harden segment file URL signing 2026-03-01 17:12:07 +08:00
autofix-ci[bot]
c693cb9789 [autofix.ci] apply automated fixes 2026-02-20 18:41:55 +00:00
Yanli 盐粒
fc91a7a38b Fix docx segment image URLs 2026-02-09 20:14:52 +08:00
6 changed files with 193 additions and 79 deletions

View File

@@ -114,7 +114,6 @@ class PdfExtractor(BaseExtractor):
""" """
image_content = [] image_content = []
upload_files = [] upload_files = []
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
try: try:
image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)) image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
@@ -164,7 +163,7 @@ class PdfExtractor(BaseExtractor):
used_at=naive_utc_now(), used_at=naive_utc_now(),
) )
upload_files.append(upload_file) upload_files.append(upload_file)
image_content.append(f"![image]({base_url}/files/{upload_file.id}/file-preview)") image_content.append(f"![image](/files/{upload_file.id}/file-preview)")
except Exception as e: except Exception as e:
logger.warning("Failed to extract image from PDF: %s", e) logger.warning("Failed to extract image from PDF: %s", e)
continue continue

View File

@@ -87,7 +87,6 @@ class WordExtractor(BaseExtractor):
def _extract_images_from_docx(self, doc): def _extract_images_from_docx(self, doc):
image_count = 0 image_count = 0
image_map = {} image_map = {}
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
for r_id, rel in doc.part.rels.items(): for r_id, rel in doc.part.rels.items():
if "image" in rel.target_ref: if "image" in rel.target_ref:
@@ -126,7 +125,7 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(), used_at=naive_utc_now(),
) )
db.session.add(upload_file) db.session.add(upload_file)
image_map[r_id] = f"![image]({base_url}/files/{upload_file.id}/file-preview)" image_map[r_id] = f"![image](/files/{upload_file.id}/file-preview)"
else: else:
image_ext = rel.target_ref.split(".")[-1] image_ext = rel.target_ref.split(".")[-1]
if image_ext is None: if image_ext is None:
@@ -154,7 +153,7 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(), used_at=naive_utc_now(),
) )
db.session.add(upload_file) db.session.add(upload_file)
image_map[rel.target_part] = f"![image]({base_url}/files/{upload_file.id}/file-preview)" image_map[rel.target_part] = f"![image](/files/{upload_file.id}/file-preview)"
db.session.commit() db.session.commit()
return image_map return image_map

View File

@@ -10,6 +10,7 @@ import re
import time import time
from datetime import datetime from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
from operator import itemgetter
from typing import Any, cast from typing import Any, cast
from uuid import uuid4 from uuid import uuid4
@@ -804,41 +805,53 @@ class DocumentSegment(Base):
def sign_content(self) -> str: def sign_content(self) -> str:
return self.get_sign_content() return self.get_sign_content()
@staticmethod
def _build_signed_query_params(*, sign_target: str, upload_file_id: str) -> str:
nonce = os.urandom(16).hex()
timestamp = str(int(time.time()))
data_to_sign = f"{sign_target}|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
return f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
def _get_accessible_upload_file_ids(self, upload_file_ids: set[str]) -> set[str]:
if not upload_file_ids:
return set()
matched_upload_file_ids = db.session.scalars(
select(UploadFile.id).where(
UploadFile.tenant_id == self.tenant_id,
UploadFile.id.in_(list(upload_file_ids)),
)
).all()
return {str(upload_file_id) for upload_file_id in matched_upload_file_ids}
def get_sign_content(self) -> str: def get_sign_content(self) -> str:
signed_urls: list[tuple[int, int, str]] = [] signed_urls: list[tuple[int, int, str]] = []
text = self.content text = self.content
# For data before v0.10.0 upload_file_preview_patterns = {
pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?" "image-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?",
matches = re.finditer(pattern, text) "file-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?",
for match in matches: }
upload_file_id = match.group(1) upload_file_matches: list[tuple[re.Match[str], str, str]] = []
nonce = os.urandom(16).hex() upload_file_ids: set[str] = set()
timestamp = str(int(time.time()))
data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}" for preview_type, pattern in upload_file_preview_patterns.items():
base_url = f"/files/{upload_file_id}/image-preview" for match in re.finditer(pattern, text):
signed_url = f"{base_url}?{params}" upload_file_id = match.group(1)
signed_urls.append((match.start(), match.end(), signed_url)) upload_file_matches.append((match, preview_type, upload_file_id))
upload_file_ids.add(upload_file_id)
# For data after v0.10.0 accessible_upload_file_ids = self._get_accessible_upload_file_ids(upload_file_ids)
pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
matches = re.finditer(pattern, text)
for match in matches:
upload_file_id = match.group(1)
nonce = os.urandom(16).hex()
timestamp = str(int(time.time()))
data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}" for match, preview_type, upload_file_id in upload_file_matches:
base_url = f"/files/{upload_file_id}/file-preview" if upload_file_id not in accessible_upload_file_ids:
continue
params = self._build_signed_query_params(sign_target=preview_type, upload_file_id=upload_file_id)
base_url = f"/files/{upload_file_id}/{preview_type}"
signed_url = f"{base_url}?{params}" signed_url = f"{base_url}?{params}"
signed_urls.append((match.start(), match.end(), signed_url)) signed_urls.append((match.start(), match.end(), signed_url))
@@ -849,19 +862,13 @@ class DocumentSegment(Base):
for match in matches: for match in matches:
upload_file_id = match.group(1) upload_file_id = match.group(1)
file_extension = match.group(2) file_extension = match.group(2)
nonce = os.urandom(16).hex() params = self._build_signed_query_params(sign_target="file-preview", upload_file_id=upload_file_id)
timestamp = str(int(time.time()))
data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
base_url = f"/files/tools/{upload_file_id}.{file_extension}" base_url = f"/files/tools/{upload_file_id}.{file_extension}"
signed_url = f"{base_url}?{params}" signed_url = f"{base_url}?{params}"
signed_urls.append((match.start(), match.end(), signed_url)) signed_urls.append((match.start(), match.end(), signed_url))
# Reconstruct the text with signed URLs # Reconstruct the text with signed URLs
signed_urls.sort(key=itemgetter(0))
offset = 0 offset = 0
for start, end, signed_url in signed_urls: for start, end, signed_url in signed_urls:
text = text[: start + offset] + signed_url + text[end + offset :] text = text[: start + offset] + signed_url + text[end + offset :]

View File

@@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp
mock_raw.FPDF_PAGEOBJ_IMAGE = 1 mock_raw.FPDF_PAGEOBJ_IMAGE = 1
result = extractor._extract_images(mock_page) result = extractor._extract_images(mock_page)
assert f"![image](http://files.local/files/{file_id}/file-preview)" in result assert f"![image](/files/{file_id}/file-preview)" in result
assert len(saves) == 1 assert len(saves) == 1
assert saves[0][1] == image_bytes assert saves[0][1] == image_bytes
assert len(db_stub.session.added) == 1 assert len(db_stub.session.added) == 1
@@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies):
result = extractor._extract_images(mock_page) result = extractor._extract_images(mock_page)
# Should have one success # Should have one success
assert "![image](http://files.local/files/test_file_id/file-preview)" in result assert "![image](/files/test_file_id/file-preview)" in result
assert len(saves) == 1 assert len(saves) == 1
assert saves[0][1] == jpeg_bytes assert saves[0][1] == jpeg_bytes
assert db_stub.session.committed is True assert db_stub.session.committed is True

View File

@@ -121,8 +121,7 @@ def test_extract_images_from_docx(monkeypatch):
db_stub = SimpleNamespace(session=DummySession()) db_stub = SimpleNamespace(session=DummySession())
monkeypatch.setattr(we, "db", db_stub) monkeypatch.setattr(we, "db", db_stub)
# Patch config values used for URL composition and storage type # Patch config value used in this code path
monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False) monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
# Patch UploadFile to avoid real DB models # Patch UploadFile to avoid real DB models
@@ -164,7 +163,7 @@ def test_extract_images_from_docx(monkeypatch):
# Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part) # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
assert set(image_map.keys()) == {"rId1", internal_part} assert set(image_map.keys()) == {"rId1", internal_part}
assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values()) assert all(v.startswith("![image](/files/") and v.endswith("/file-preview)") for v in image_map.values())
# Storage should receive both payloads # Storage should receive both payloads
payloads = {data for _, data in saves} payloads = {data for _, data in saves}
@@ -176,39 +175,6 @@ def test_extract_images_from_docx(monkeypatch):
assert db_stub.session.committed is True assert db_stub.session.committed is True
def test_extract_images_from_docx_uses_internal_files_url():
"""Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access."""
# Test the URL generation logic directly
from configs import dify_config
# Mock the configuration values
original_files_url = getattr(dify_config, "FILES_URL", None)
original_internal_files_url = getattr(dify_config, "INTERNAL_FILES_URL", None)
try:
# Set both URLs - INTERNAL should take precedence
dify_config.FILES_URL = "http://external.example.com"
dify_config.INTERNAL_FILES_URL = "http://internal.docker:5001"
# Test the URL generation logic (same as in word_extractor.py)
upload_file_id = "test_file_id"
# This is the pattern we fixed in the word extractor
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
generated_url = f"{base_url}/files/{upload_file_id}/file-preview"
# Verify that INTERNAL_FILES_URL is used instead of FILES_URL
assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}"
assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}"
finally:
# Restore original values
if original_files_url is not None:
dify_config.FILES_URL = original_files_url
if original_internal_files_url is not None:
dify_config.INTERNAL_FILES_URL = original_internal_files_url
def test_extract_hyperlinks(monkeypatch): def test_extract_hyperlinks(monkeypatch):
# Mock db and storage to avoid issues during image extraction (even if no images are present) # Mock db and storage to avoid issues during image extraction (even if no images are present)
monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None)) monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))

View File

@@ -15,6 +15,7 @@ from datetime import UTC, datetime
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from uuid import uuid4 from uuid import uuid4
import models.dataset as dataset_module
from models.dataset import ( from models.dataset import (
AppDatasetJoin, AppDatasetJoin,
ChildChunk, ChildChunk,
@@ -489,6 +490,15 @@ class TestDocumentModelRelationships:
class TestDocumentSegmentIndexing: class TestDocumentSegmentIndexing:
"""Test suite for DocumentSegment model indexing and operations.""" """Test suite for DocumentSegment model indexing and operations."""
@staticmethod
def _mock_scalars_result(upload_file_ids: list[str]):
class _ScalarsResult:
@staticmethod
def all() -> list[str]:
return upload_file_ids
return _ScalarsResult()
def test_document_segment_creation_with_required_fields(self): def test_document_segment_creation_with_required_fields(self):
"""Test creating a document segment with all required fields.""" """Test creating a document segment with all required fields."""
# Arrange # Arrange
@@ -547,6 +557,139 @@ class TestDocumentSegmentIndexing:
assert segment.index_node_hash == index_node_hash assert segment.index_node_hash == index_node_hash
assert segment.keywords == keywords assert segment.keywords == keywords
def test_document_segment_sign_content_strips_absolute_files_host(self):
"""Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL."""
# Arrange
upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
segment = DocumentSegment(
tenant_id=str(uuid4()),
dataset_id=str(uuid4()),
document_id=str(uuid4()),
position=1,
content=f"![image](http://internal.docker:5001/files/{upload_file_id}/file-preview)",
word_count=1,
tokens=1,
created_by=str(uuid4()),
)
mock_scalars_result = self._mock_scalars_result([upload_file_id])
# Act
with (
patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
patch("models.dataset.time.time", return_value=1700000000),
patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
):
signed = segment.get_sign_content()
# Assert
assert "internal.docker:5001" not in signed
assert f"/files/{upload_file_id}/file-preview?timestamp=" in signed
assert "&nonce=" in signed
assert "&sign=" in signed
def test_document_segment_sign_content_strips_absolute_files_host_for_image_preview(self):
"""Test that sign_content strips scheme/host from absolute image-preview URLs."""
# Arrange
upload_file_id = "e2a4f7b1-1234-5678-9abc-def012345678"
segment = DocumentSegment(
tenant_id=str(uuid4()),
dataset_id=str(uuid4()),
document_id=str(uuid4()),
position=1,
content=f"![image](http://internal.docker:5001/files/{upload_file_id}/image-preview)",
word_count=1,
tokens=1,
created_by=str(uuid4()),
)
mock_scalars_result = self._mock_scalars_result([upload_file_id])
# Act
with (
patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
patch("models.dataset.time.time", return_value=1700000000),
patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
):
signed = segment.get_sign_content()
# Assert
assert "internal.docker:5001" not in signed
assert f"/files/{upload_file_id}/image-preview?timestamp=" in signed
assert "&nonce=" in signed
assert "&sign=" in signed
def test_document_segment_sign_content_skips_upload_files_outside_tenant(self):
"""Test that sign_content only signs upload files belonging to the segment tenant."""
# Arrange
allowed_upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
denied_upload_file_id = "f8f35fca-568f-4626-adf0-4f30de96aa32"
segment = DocumentSegment(
tenant_id=str(uuid4()),
dataset_id=str(uuid4()),
document_id=str(uuid4()),
position=1,
content=(
f"allowed: ![image](/files/{allowed_upload_file_id}/file-preview) "
f"denied: ![image](/files/{denied_upload_file_id}/file-preview)"
),
word_count=1,
tokens=1,
created_by=str(uuid4()),
)
mock_scalars_result = self._mock_scalars_result([allowed_upload_file_id])
# Act
with (
patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
patch("models.dataset.time.time", return_value=1700000000),
patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
):
signed = segment.get_sign_content()
# Assert
assert f"/files/{allowed_upload_file_id}/file-preview?timestamp=" in signed
assert f"/files/{denied_upload_file_id}/file-preview?timestamp=" not in signed
assert f"/files/{denied_upload_file_id}/file-preview)" in signed
def test_document_segment_sign_content_handles_mixed_preview_order(self):
"""Test that sign_content preserves content when file-preview appears before image-preview."""
# Arrange
file_preview_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
image_preview_id = "e2a4f7b1-1234-5678-9abc-def012345678"
segment = DocumentSegment(
tenant_id=str(uuid4()),
dataset_id=str(uuid4()),
document_id=str(uuid4()),
position=1,
content=(
f"file-first: ![file](/files/{file_preview_id}/file-preview) "
f"then-image: ![image](/files/{image_preview_id}/image-preview)"
),
word_count=1,
tokens=1,
created_by=str(uuid4()),
)
mock_scalars_result = self._mock_scalars_result([file_preview_id, image_preview_id])
# Act
with (
patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
patch("models.dataset.time.time", return_value=1700000000),
patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
):
signed = segment.get_sign_content()
# Assert
file_signed = f"/files/{file_preview_id}/file-preview?timestamp="
image_signed = f"/files/{image_preview_id}/image-preview?timestamp="
assert file_signed in signed
assert image_signed in signed
assert signed.index(file_signed) < signed.index(image_signed)
assert signed.count("&sign=") == 2
def test_document_segment_with_answer_field(self): def test_document_segment_with_answer_field(self):
"""Test creating a document segment with answer field for QA model.""" """Test creating a document segment with answer field for QA model."""
# Arrange # Arrange