style: normalize import ordering in dataset model

fix: sort signed URL replacements by position
test: avoid MagicMock dependency in segment signing tests
2026-03-02 13:35:10 +00:00 · 2026-03-01 17:40:28 +08:00 · 2026-03-01 17:38:45 +08:00 · 2026-03-01 17:23:18 +08:00 · 2026-03-01 17:12:07 +08:00 · 2026-02-20 18:41:55 +00:00
6 changed files with 193 additions and 79 deletions
--- a/api/core/rag/extractor/pdf_extractor.py
+++ b/api/core/rag/extractor/pdf_extractor.py
@@ -114,7 +114,6 @@ class PdfExtractor(BaseExtractor):
        """
        image_content = []
        upload_files = []
        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
        try:
            image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
@@ -164,7 +163,7 @@ class PdfExtractor(BaseExtractor):
                        used_at=naive_utc_now(),
                    )
                    upload_files.append(upload_file)
-                    image_content.append(f"![image]({base_url}/files/{upload_file.id}/file-preview)")
+                    image_content.append(f"![image](/files/{upload_file.id}/file-preview)")
                except Exception as e:
                    logger.warning("Failed to extract image from PDF: %s", e)
                    continue
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -87,7 +87,6 @@ class WordExtractor(BaseExtractor):
    def _extract_images_from_docx(self, doc):
        image_count = 0
        image_map = {}
        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
        for r_id, rel in doc.part.rels.items():
            if "image" in rel.target_ref:
@@ -126,7 +125,7 @@ class WordExtractor(BaseExtractor):
                            used_at=naive_utc_now(),
                        )
                        db.session.add(upload_file)
-                        image_map[r_id] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
+                        image_map[r_id] = f"![image](/files/{upload_file.id}/file-preview)"
                else:
                    image_ext = rel.target_ref.split(".")[-1]
                    if image_ext is None:
@@ -154,7 +153,7 @@ class WordExtractor(BaseExtractor):
                        used_at=naive_utc_now(),
                    )
                    db.session.add(upload_file)
-                    image_map[rel.target_part] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
+                    image_map[rel.target_part] = f"![image](/files/{upload_file.id}/file-preview)"
        db.session.commit()
        return image_map
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -10,6 +10,7 @@ import re
 import time
 from datetime import datetime
 from json import JSONDecodeError
 from operator import itemgetter
 from typing import Any, cast
 from uuid import uuid4
@@ -804,41 +805,53 @@ class DocumentSegment(Base):
    def sign_content(self) -> str:
        return self.get_sign_content()
    @staticmethod
    def _build_signed_query_params(*, sign_target: str, upload_file_id: str) -> str:
        nonce = os.urandom(16).hex()
        timestamp = str(int(time.time()))
        data_to_sign = f"{sign_target}|{upload_file_id}|{timestamp}|{nonce}"
        secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
        sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
        encoded_sign = base64.urlsafe_b64encode(sign).decode()
        return f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
    def _get_accessible_upload_file_ids(self, upload_file_ids: set[str]) -> set[str]:
        if not upload_file_ids:
            return set()
        matched_upload_file_ids = db.session.scalars(
            select(UploadFile.id).where(
                UploadFile.tenant_id == self.tenant_id,
                UploadFile.id.in_(list(upload_file_ids)),
            )
        ).all()
        return {str(upload_file_id) for upload_file_id in matched_upload_file_ids}
    def get_sign_content(self) -> str:
        signed_urls: list[tuple[int, int, str]] = []
        text = self.content
-        # For data before v0.10.0
+        upload_file_preview_patterns = {
-        pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
+            "image-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?",
-        matches = re.finditer(pattern, text)
+            "file-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?",
-        for match in matches:
+        }
-            upload_file_id = match.group(1)
+        upload_file_matches: list[tuple[re.Match[str], str, str]] = []
-            nonce = os.urandom(16).hex()
+        upload_file_ids: set[str] = set()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
-            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
+        for preview_type, pattern in upload_file_preview_patterns.items():
-            base_url = f"/files/{upload_file_id}/image-preview"
+            for match in re.finditer(pattern, text):
-            signed_url = f"{base_url}?{params}"
+                upload_file_id = match.group(1)
-            signed_urls.append((match.start(), match.end(), signed_url))
+                upload_file_matches.append((match, preview_type, upload_file_id))
                upload_file_ids.add(upload_file_id)
-        # For data after v0.10.0
+        accessible_upload_file_ids = self._get_accessible_upload_file_ids(upload_file_ids)
        pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
-            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
+        for match, preview_type, upload_file_id in upload_file_matches:
-            base_url = f"/files/{upload_file_id}/file-preview"
+            if upload_file_id not in accessible_upload_file_ids:
                continue
            params = self._build_signed_query_params(sign_target=preview_type, upload_file_id=upload_file_id)
            base_url = f"/files/{upload_file_id}/{preview_type}"
            signed_url = f"{base_url}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))
@@ -849,19 +862,13 @@ class DocumentSegment(Base):
        for match in matches:
            upload_file_id = match.group(1)
            file_extension = match.group(2)
-            nonce = os.urandom(16).hex()
+            params = self._build_signed_query_params(sign_target="file-preview", upload_file_id=upload_file_id)
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            base_url = f"/files/tools/{upload_file_id}.{file_extension}"
            signed_url = f"{base_url}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))
        # Reconstruct the text with signed URLs
        signed_urls.sort(key=itemgetter(0))
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
--- a/api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py
+++ b/api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py
@@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp
        mock_raw.FPDF_PAGEOBJ_IMAGE = 1
        result = extractor._extract_images(mock_page)
-    assert f"![image](http://files.local/files/{file_id}/file-preview)" in result
+    assert f"![image](/files/{file_id}/file-preview)" in result
    assert len(saves) == 1
    assert saves[0][1] == image_bytes
    assert len(db_stub.session.added) == 1
@@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies):
        result = extractor._extract_images(mock_page)
    # Should have one success
-    assert "![image](http://files.local/files/test_file_id/file-preview)" in result
+    assert "![image](/files/test_file_id/file-preview)" in result
    assert len(saves) == 1
    assert saves[0][1] == jpeg_bytes
    assert db_stub.session.committed is True
--- a/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py
+++ b/api/tests/unit_tests/core/rag/extractor/test_word_extractor.py
@@ -121,8 +121,7 @@ def test_extract_images_from_docx(monkeypatch):
    db_stub = SimpleNamespace(session=DummySession())
    monkeypatch.setattr(we, "db", db_stub)
-    # Patch config values used for URL composition and storage type
+    # Patch config value used in this code path
    monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
    monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
    # Patch UploadFile to avoid real DB models
@@ -164,7 +163,7 @@ def test_extract_images_from_docx(monkeypatch):
    # Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
    assert set(image_map.keys()) == {"rId1", internal_part}
-    assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values())
+    assert all(v.startswith("![image](/files/") and v.endswith("/file-preview)") for v in image_map.values())
    # Storage should receive both payloads
    payloads = {data for _, data in saves}
@@ -176,39 +175,6 @@ def test_extract_images_from_docx(monkeypatch):
    assert db_stub.session.committed is True
 def test_extract_images_from_docx_uses_internal_files_url():
    """Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access."""
    # Test the URL generation logic directly
    from configs import dify_config
    # Mock the configuration values
    original_files_url = getattr(dify_config, "FILES_URL", None)
    original_internal_files_url = getattr(dify_config, "INTERNAL_FILES_URL", None)
    try:
        # Set both URLs - INTERNAL should take precedence
        dify_config.FILES_URL = "http://external.example.com"
        dify_config.INTERNAL_FILES_URL = "http://internal.docker:5001"
        # Test the URL generation logic (same as in word_extractor.py)
        upload_file_id = "test_file_id"
        # This is the pattern we fixed in the word extractor
        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
        generated_url = f"{base_url}/files/{upload_file_id}/file-preview"
        # Verify that INTERNAL_FILES_URL is used instead of FILES_URL
        assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}"
        assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}"
    finally:
        # Restore original values
        if original_files_url is not None:
            dify_config.FILES_URL = original_files_url
        if original_internal_files_url is not None:
            dify_config.INTERNAL_FILES_URL = original_internal_files_url
 def test_extract_hyperlinks(monkeypatch):
    # Mock db and storage to avoid issues during image extraction (even if no images are present)
    monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))
--- a/api/tests/unit_tests/models/test_dataset_models.py
+++ b/api/tests/unit_tests/models/test_dataset_models.py
@@ -15,6 +15,7 @@ from datetime import UTC, datetime
 from unittest.mock import MagicMock, patch
 from uuid import uuid4
 import models.dataset as dataset_module
 from models.dataset import (
    AppDatasetJoin,
    ChildChunk,
@@ -489,6 +490,15 @@ class TestDocumentModelRelationships:
 class TestDocumentSegmentIndexing:
    """Test suite for DocumentSegment model indexing and operations."""
    @staticmethod
    def _mock_scalars_result(upload_file_ids: list[str]):
        class _ScalarsResult:
            @staticmethod
            def all() -> list[str]:
                return upload_file_ids
        return _ScalarsResult()
    def test_document_segment_creation_with_required_fields(self):
        """Test creating a document segment with all required fields."""
        # Arrange
@@ -547,6 +557,139 @@ class TestDocumentSegmentIndexing:
        assert segment.index_node_hash == index_node_hash
        assert segment.keywords == keywords
    def test_document_segment_sign_content_strips_absolute_files_host(self):
        """Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL."""
        # Arrange
        upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
        segment = DocumentSegment(
            tenant_id=str(uuid4()),
            dataset_id=str(uuid4()),
            document_id=str(uuid4()),
            position=1,
            content=f"![image](http://internal.docker:5001/files/{upload_file_id}/file-preview)",
            word_count=1,
            tokens=1,
            created_by=str(uuid4()),
        )
        mock_scalars_result = self._mock_scalars_result([upload_file_id])
        # Act
        with (
            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
            patch("models.dataset.time.time", return_value=1700000000),
            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
        ):
            signed = segment.get_sign_content()
        # Assert
        assert "internal.docker:5001" not in signed
        assert f"/files/{upload_file_id}/file-preview?timestamp=" in signed
        assert "&nonce=" in signed
        assert "&sign=" in signed
    def test_document_segment_sign_content_strips_absolute_files_host_for_image_preview(self):
        """Test that sign_content strips scheme/host from absolute image-preview URLs."""
        # Arrange
        upload_file_id = "e2a4f7b1-1234-5678-9abc-def012345678"
        segment = DocumentSegment(
            tenant_id=str(uuid4()),
            dataset_id=str(uuid4()),
            document_id=str(uuid4()),
            position=1,
            content=f"![image](http://internal.docker:5001/files/{upload_file_id}/image-preview)",
            word_count=1,
            tokens=1,
            created_by=str(uuid4()),
        )
        mock_scalars_result = self._mock_scalars_result([upload_file_id])
        # Act
        with (
            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
            patch("models.dataset.time.time", return_value=1700000000),
            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
        ):
            signed = segment.get_sign_content()
        # Assert
        assert "internal.docker:5001" not in signed
        assert f"/files/{upload_file_id}/image-preview?timestamp=" in signed
        assert "&nonce=" in signed
        assert "&sign=" in signed
    def test_document_segment_sign_content_skips_upload_files_outside_tenant(self):
        """Test that sign_content only signs upload files belonging to the segment tenant."""
        # Arrange
        allowed_upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
        denied_upload_file_id = "f8f35fca-568f-4626-adf0-4f30de96aa32"
        segment = DocumentSegment(
            tenant_id=str(uuid4()),
            dataset_id=str(uuid4()),
            document_id=str(uuid4()),
            position=1,
            content=(
                f"allowed: ![image](/files/{allowed_upload_file_id}/file-preview) "
                f"denied: ![image](/files/{denied_upload_file_id}/file-preview)"
            ),
            word_count=1,
            tokens=1,
            created_by=str(uuid4()),
        )
        mock_scalars_result = self._mock_scalars_result([allowed_upload_file_id])
        # Act
        with (
            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
            patch("models.dataset.time.time", return_value=1700000000),
            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
        ):
            signed = segment.get_sign_content()
        # Assert
        assert f"/files/{allowed_upload_file_id}/file-preview?timestamp=" in signed
        assert f"/files/{denied_upload_file_id}/file-preview?timestamp=" not in signed
        assert f"/files/{denied_upload_file_id}/file-preview)" in signed
    def test_document_segment_sign_content_handles_mixed_preview_order(self):
        """Test that sign_content preserves content when file-preview appears before image-preview."""
        # Arrange
        file_preview_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
        image_preview_id = "e2a4f7b1-1234-5678-9abc-def012345678"
        segment = DocumentSegment(
            tenant_id=str(uuid4()),
            dataset_id=str(uuid4()),
            document_id=str(uuid4()),
            position=1,
            content=(
                f"file-first: ![file](/files/{file_preview_id}/file-preview) "
                f"then-image: ![image](/files/{image_preview_id}/image-preview)"
            ),
            word_count=1,
            tokens=1,
            created_by=str(uuid4()),
        )
        mock_scalars_result = self._mock_scalars_result([file_preview_id, image_preview_id])
        # Act
        with (
            patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
            patch("models.dataset.db.session.scalars", return_value=mock_scalars_result),
            patch("models.dataset.time.time", return_value=1700000000),
            patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
        ):
            signed = segment.get_sign_content()
        # Assert
        file_signed = f"/files/{file_preview_id}/file-preview?timestamp="
        image_signed = f"/files/{image_preview_id}/image-preview?timestamp="
        assert file_signed in signed
        assert image_signed in signed
        assert signed.index(file_signed) < signed.index(image_signed)
        assert signed.count("&sign=") == 2
    def test_document_segment_with_answer_field(self):
        """Test creating a document segment with answer field for QA model."""
        # Arrange
Author	SHA1	Message	Date
Yanli 盐粒	60c858aa48	style: normalize import ordering in dataset model	2026-03-01 17:40:28 +08:00
Yanli 盐粒	46a3a2ae09	fix: sort signed URL replacements by position	2026-03-01 17:38:45 +08:00
Yanli 盐粒	30af50cb47	test: avoid MagicMock dependency in segment signing tests	2026-03-01 17:23:18 +08:00
Yanli 盐粒	0d6a4bac0f	fix: harden segment file URL signing	2026-03-01 17:12:07 +08:00
autofix-ci[bot]	c693cb9789	[autofix.ci] apply automated fixes	2026-02-20 18:41:55 +00:00
Yanli 盐粒	fc91a7a38b	Fix docx segment image URLs	2026-02-09 20:14:52 +08:00