Compare commits

..

1 Commit

Author SHA1 Message Date
Stephen Zhou
e535a1def2 test: stable test 2026-02-09 16:40:29 +08:00
7 changed files with 94 additions and 207 deletions

View File

@@ -114,6 +114,7 @@ class PdfExtractor(BaseExtractor):
"""
image_content = []
upload_files = []
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
try:
image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
@@ -163,7 +164,7 @@ class PdfExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
upload_files.append(upload_file)
image_content.append(f"![image](/files/{upload_file.id}/file-preview)")
image_content.append(f"![image]({base_url}/files/{upload_file.id}/file-preview)")
except Exception as e:
logger.warning("Failed to extract image from PDF: %s", e)
continue

View File

@@ -87,6 +87,7 @@ class WordExtractor(BaseExtractor):
def _extract_images_from_docx(self, doc):
image_count = 0
image_map = {}
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
for r_id, rel in doc.part.rels.items():
if "image" in rel.target_ref:
@@ -125,7 +126,7 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
db.session.add(upload_file)
image_map[r_id] = f"![image](/files/{upload_file.id}/file-preview)"
image_map[r_id] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
else:
image_ext = rel.target_ref.split(".")[-1]
if image_ext is None:
@@ -153,7 +154,7 @@ class WordExtractor(BaseExtractor):
used_at=naive_utc_now(),
)
db.session.add(upload_file)
image_map[rel.target_part] = f"![image](/files/{upload_file.id}/file-preview)"
image_map[rel.target_part] = f"![image]({base_url}/files/{upload_file.id}/file-preview)"
db.session.commit()
return image_map

View File

@@ -10,7 +10,6 @@ import re
import time
from datetime import datetime
from json import JSONDecodeError
from operator import itemgetter
from typing import Any, cast
from uuid import uuid4
@@ -805,53 +804,41 @@ class DocumentSegment(Base):
def sign_content(self) -> str:
return self.get_sign_content()
@staticmethod
def _build_signed_query_params(*, sign_target: str, upload_file_id: str) -> str:
    """Build the query string (timestamp, nonce, HMAC signature) that signs a file-preview URL.

    The signature covers the preview type, the upload file id, the current
    timestamp and a random nonce, keyed with the app SECRET_KEY.
    """
    current_ts = str(int(time.time()))
    random_nonce = os.urandom(16).hex()
    # An unset SECRET_KEY degrades to an empty key rather than crashing.
    key_bytes = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
    payload = f"{sign_target}|{upload_file_id}|{current_ts}|{random_nonce}"
    digest = hmac.new(key_bytes, payload.encode(), hashlib.sha256).digest()
    signature = base64.urlsafe_b64encode(digest).decode()
    return f"timestamp={current_ts}&nonce={random_nonce}&sign={signature}"
def _get_accessible_upload_file_ids(self, upload_file_ids: set[str]) -> set[str]:
    """Return the subset of *upload_file_ids* that belong to this segment's tenant.

    IDs not owned by ``self.tenant_id`` are silently dropped so callers never
    sign URLs for another tenant's files.
    """
    if not upload_file_ids:
        return set()
    stmt = select(UploadFile.id).where(
        UploadFile.tenant_id == self.tenant_id,
        UploadFile.id.in_(list(upload_file_ids)),
    )
    matched_ids = db.session.scalars(stmt).all()
    return {str(matched_id) for matched_id in matched_ids}
def get_sign_content(self) -> str:
signed_urls: list[tuple[int, int, str]] = []
text = self.content
upload_file_preview_patterns = {
"image-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/image-preview(?:\?[^\s\)\"\']*)?",
"file-preview": r"(?:https?://[^\s\)\"\']+)?/files/([a-f0-9\-]+)/file-preview(?:\?[^\s\)\"\']*)?",
}
upload_file_matches: list[tuple[re.Match[str], str, str]] = []
upload_file_ids: set[str] = set()
# For data before v0.10.0
pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
matches = re.finditer(pattern, text)
for match in matches:
upload_file_id = match.group(1)
nonce = os.urandom(16).hex()
timestamp = str(int(time.time()))
data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
for preview_type, pattern in upload_file_preview_patterns.items():
for match in re.finditer(pattern, text):
upload_file_id = match.group(1)
upload_file_matches.append((match, preview_type, upload_file_id))
upload_file_ids.add(upload_file_id)
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
base_url = f"/files/{upload_file_id}/image-preview"
signed_url = f"{base_url}?{params}"
signed_urls.append((match.start(), match.end(), signed_url))
accessible_upload_file_ids = self._get_accessible_upload_file_ids(upload_file_ids)
# For data after v0.10.0
pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
matches = re.finditer(pattern, text)
for match in matches:
upload_file_id = match.group(1)
nonce = os.urandom(16).hex()
timestamp = str(int(time.time()))
data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
for match, preview_type, upload_file_id in upload_file_matches:
if upload_file_id not in accessible_upload_file_ids:
continue
params = self._build_signed_query_params(sign_target=preview_type, upload_file_id=upload_file_id)
base_url = f"/files/{upload_file_id}/{preview_type}"
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
base_url = f"/files/{upload_file_id}/file-preview"
signed_url = f"{base_url}?{params}"
signed_urls.append((match.start(), match.end(), signed_url))
@@ -862,13 +849,19 @@ class DocumentSegment(Base):
for match in matches:
upload_file_id = match.group(1)
file_extension = match.group(2)
params = self._build_signed_query_params(sign_target="file-preview", upload_file_id=upload_file_id)
nonce = os.urandom(16).hex()
timestamp = str(int(time.time()))
data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
encoded_sign = base64.urlsafe_b64encode(sign).decode()
params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
base_url = f"/files/tools/{upload_file_id}.{file_extension}"
signed_url = f"{base_url}?{params}"
signed_urls.append((match.start(), match.end(), signed_url))
# Reconstruct the text with signed URLs
signed_urls.sort(key=itemgetter(0))
offset = 0
for start, end, signed_url in signed_urls:
text = text[: start + offset] + signed_url + text[end + offset :]

View File

@@ -87,7 +87,7 @@ def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, exp
mock_raw.FPDF_PAGEOBJ_IMAGE = 1
result = extractor._extract_images(mock_page)
assert f"![image](/files/{file_id}/file-preview)" in result
assert f"![image](http://files.local/files/{file_id}/file-preview)" in result
assert len(saves) == 1
assert saves[0][1] == image_bytes
assert len(db_stub.session.added) == 1
@@ -180,7 +180,7 @@ def test_extract_images_failures(mock_dependencies):
result = extractor._extract_images(mock_page)
# Should have one success
assert "![image](/files/test_file_id/file-preview)" in result
assert "![image](http://files.local/files/test_file_id/file-preview)" in result
assert len(saves) == 1
assert saves[0][1] == jpeg_bytes
assert db_stub.session.committed is True

View File

@@ -121,7 +121,8 @@ def test_extract_images_from_docx(monkeypatch):
db_stub = SimpleNamespace(session=DummySession())
monkeypatch.setattr(we, "db", db_stub)
# Patch config value used in this code path
# Patch config values used for URL composition and storage type
monkeypatch.setattr(we.dify_config, "FILES_URL", "http://files.local", raising=False)
monkeypatch.setattr(we.dify_config, "STORAGE_TYPE", "local", raising=False)
# Patch UploadFile to avoid real DB models
@@ -163,7 +164,7 @@ def test_extract_images_from_docx(monkeypatch):
# Returned map should contain entries for external (keyed by rId) and internal (keyed by target_part)
assert set(image_map.keys()) == {"rId1", internal_part}
assert all(v.startswith("![image](/files/") and v.endswith("/file-preview)") for v in image_map.values())
assert all(v.startswith("![image](") and v.endswith("/file-preview)") for v in image_map.values())
# Storage should receive both payloads
payloads = {data for _, data in saves}
@@ -175,6 +176,39 @@ def test_extract_images_from_docx(monkeypatch):
assert db_stub.session.committed is True
def test_extract_images_from_docx_uses_internal_files_url():
    """Test that INTERNAL_FILES_URL takes precedence over FILES_URL for plugin access."""
    # Test the URL generation logic directly
    from configs import dify_config

    # Snapshot current values so the mutation below cannot leak into other tests.
    original_files_url = getattr(dify_config, "FILES_URL", None)
    original_internal_files_url = getattr(dify_config, "INTERNAL_FILES_URL", None)
    try:
        # Set both URLs - INTERNAL should take precedence
        dify_config.FILES_URL = "http://external.example.com"
        dify_config.INTERNAL_FILES_URL = "http://internal.docker:5001"

        # Test the URL generation logic (same as in word_extractor.py)
        upload_file_id = "test_file_id"
        # This is the pattern we fixed in the word extractor
        base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
        generated_url = f"{base_url}/files/{upload_file_id}/file-preview"

        # Verify that INTERNAL_FILES_URL is used instead of FILES_URL
        assert "http://internal.docker:5001" in generated_url, f"Expected internal URL, got: {generated_url}"
        assert "http://external.example.com" not in generated_url, f"Should not use external URL, got: {generated_url}"
    finally:
        # Restore unconditionally. The previous version skipped restoration when a
        # snapshot was None, which left the fake test URLs set on the shared
        # dify_config singleton and leaked into every subsequent test.
        dify_config.FILES_URL = original_files_url
        dify_config.INTERNAL_FILES_URL = original_internal_files_url
def test_extract_hyperlinks(monkeypatch):
# Mock db and storage to avoid issues during image extraction (even if no images are present)
monkeypatch.setattr(we, "storage", SimpleNamespace(save=lambda k, d: None))

View File

@@ -15,7 +15,6 @@ from datetime import UTC, datetime
from unittest.mock import MagicMock, patch
from uuid import uuid4
import models.dataset as dataset_module
from models.dataset import (
AppDatasetJoin,
ChildChunk,
@@ -490,15 +489,6 @@ class TestDocumentModelRelationships:
class TestDocumentSegmentIndexing:
"""Test suite for DocumentSegment model indexing and operations."""
@staticmethod
def _mock_scalars_result(upload_file_ids: list[str]):
class _ScalarsResult:
@staticmethod
def all() -> list[str]:
return upload_file_ids
return _ScalarsResult()
def test_document_segment_creation_with_required_fields(self):
"""Test creating a document segment with all required fields."""
# Arrange
@@ -557,139 +547,6 @@ class TestDocumentSegmentIndexing:
assert segment.index_node_hash == index_node_hash
assert segment.keywords == keywords
def test_document_segment_sign_content_strips_absolute_files_host(self):
    """Test that sign_content strips scheme/host from absolute /files URLs and returns a signed relative URL."""
    # Arrange: segment content embeds an absolute internal-host file-preview URL.
    file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
    segment = DocumentSegment(
        tenant_id=str(uuid4()),
        dataset_id=str(uuid4()),
        document_id=str(uuid4()),
        position=1,
        content=f"![image](http://internal.docker:5001/files/{file_id}/file-preview)",
        word_count=1,
        tokens=1,
        created_by=str(uuid4()),
    )
    scalars_stub = self._mock_scalars_result([file_id])

    # Act: pin secret, tenant lookup, clock, and nonce so signing is deterministic.
    with (
        patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
        patch("models.dataset.db.session.scalars", return_value=scalars_stub),
        patch("models.dataset.time.time", return_value=1700000000),
        patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
    ):
        signed = segment.get_sign_content()

    # Assert: the host is gone and the relative URL carries signing parameters.
    assert "internal.docker:5001" not in signed
    assert f"/files/{file_id}/file-preview?timestamp=" in signed
    assert "&nonce=" in signed
    assert "&sign=" in signed
def test_document_segment_sign_content_strips_absolute_files_host_for_image_preview(self):
    """Test that sign_content strips scheme/host from absolute image-preview URLs."""
    # Arrange: same scenario as the file-preview variant, but for legacy image-preview URLs.
    file_id = "e2a4f7b1-1234-5678-9abc-def012345678"
    segment = DocumentSegment(
        tenant_id=str(uuid4()),
        dataset_id=str(uuid4()),
        document_id=str(uuid4()),
        position=1,
        content=f"![image](http://internal.docker:5001/files/{file_id}/image-preview)",
        word_count=1,
        tokens=1,
        created_by=str(uuid4()),
    )
    scalars_stub = self._mock_scalars_result([file_id])

    # Act under deterministic signing inputs.
    with (
        patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
        patch("models.dataset.db.session.scalars", return_value=scalars_stub),
        patch("models.dataset.time.time", return_value=1700000000),
        patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
    ):
        signed = segment.get_sign_content()

    # Assert: host stripped, signed relative image-preview URL produced.
    assert "internal.docker:5001" not in signed
    assert f"/files/{file_id}/image-preview?timestamp=" in signed
    assert "&nonce=" in signed
    assert "&sign=" in signed
def test_document_segment_sign_content_skips_upload_files_outside_tenant(self):
    """Test that sign_content only signs upload files belonging to the segment tenant."""
    # Arrange: content references one in-tenant file and one foreign file.
    allowed_upload_file_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
    denied_upload_file_id = "f8f35fca-568f-4626-adf0-4f30de96aa32"
    segment = DocumentSegment(
        tenant_id=str(uuid4()),
        dataset_id=str(uuid4()),
        document_id=str(uuid4()),
        position=1,
        content=(
            f"allowed: ![image](/files/{allowed_upload_file_id}/file-preview) "
            f"denied: ![image](/files/{denied_upload_file_id}/file-preview)"
        ),
        word_count=1,
        tokens=1,
        created_by=str(uuid4()),
    )
    # The tenant lookup stub only returns the allowed id.
    scalars_stub = self._mock_scalars_result([allowed_upload_file_id])

    # Act under deterministic signing inputs.
    with (
        patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
        patch("models.dataset.db.session.scalars", return_value=scalars_stub),
        patch("models.dataset.time.time", return_value=1700000000),
        patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
    ):
        signed = segment.get_sign_content()

    # Assert: only the in-tenant URL is signed; the foreign one is left untouched.
    assert f"/files/{allowed_upload_file_id}/file-preview?timestamp=" in signed
    assert f"/files/{denied_upload_file_id}/file-preview?timestamp=" not in signed
    assert f"/files/{denied_upload_file_id}/file-preview)" in signed
def test_document_segment_sign_content_handles_mixed_preview_order(self):
    """Test that sign_content preserves content when file-preview appears before image-preview."""
    # Arrange: a file-preview URL precedes an image-preview URL in the content.
    file_preview_id = "1602650a-4fe4-423c-85a2-af76c083e3c4"
    image_preview_id = "e2a4f7b1-1234-5678-9abc-def012345678"
    segment = DocumentSegment(
        tenant_id=str(uuid4()),
        dataset_id=str(uuid4()),
        document_id=str(uuid4()),
        position=1,
        content=(
            f"file-first: ![file](/files/{file_preview_id}/file-preview) "
            f"then-image: ![image](/files/{image_preview_id}/image-preview)"
        ),
        word_count=1,
        tokens=1,
        created_by=str(uuid4()),
    )
    scalars_stub = self._mock_scalars_result([file_preview_id, image_preview_id])

    # Act under deterministic signing inputs.
    with (
        patch.object(dataset_module.dify_config, "SECRET_KEY", "secret", create=True),
        patch("models.dataset.db.session.scalars", return_value=scalars_stub),
        patch("models.dataset.time.time", return_value=1700000000),
        patch("models.dataset.os.urandom", return_value=b"\x00" * 16),
    ):
        signed = segment.get_sign_content()

    # Assert: both URLs are signed, original order is kept, and exactly two signatures exist.
    file_signed = f"/files/{file_preview_id}/file-preview?timestamp="
    image_signed = f"/files/{image_preview_id}/image-preview?timestamp="
    assert file_signed in signed
    assert image_signed in signed
    assert signed.index(file_signed) < signed.index(image_signed)
    assert signed.count("&sign=") == 2
def test_document_segment_with_answer_field(self):
"""Test creating a document segment with answer field for QA model."""
# Arrange

View File

@@ -3,8 +3,7 @@ import type { MockedFunction } from 'vitest'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
import type { DataSet } from '@/models/datasets'
import type { RetrievalConfig } from '@/types/app'
import { fireEvent, render, screen, waitFor, within } from '@testing-library/react'
import userEvent from '@testing-library/user-event'
import { fireEvent, render, screen, waitFor } from '@testing-library/react'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import { ChunkingMode, DatasetPermission, DataSourceType } from '@/models/datasets'
import { RETRIEVE_METHOD } from '@/types/app'
@@ -137,6 +136,12 @@ const renderItem = (config: DataSet, props?: Partial<React.ComponentProps<typeof
return { onSave, onRemove }
}
const getActionButtons = (card: HTMLElement) => {
const actionButtons = Array.from(card.querySelectorAll<HTMLButtonElement>('button.action-btn'))
expect(actionButtons).toHaveLength(2)
return actionButtons
}
describe('dataset-config/card-item', () => {
beforeEach(() => {
vi.clearAllMocks()
@@ -155,7 +160,7 @@ describe('dataset-config/card-item', () => {
renderItem(dataset)
const card = screen.getByText(dataset.name).closest('.group') as HTMLElement
const actionButtons = within(card).getAllByRole('button', { hidden: true })
const actionButtons = getActionButtons(card)
expect(screen.getByText(dataset.name)).toBeInTheDocument()
expect(screen.getByText('dataset.indexingTechnique.high_quality · dataset.indexingMethod.semantic_search')).toBeInTheDocument()
@@ -164,20 +169,19 @@ describe('dataset-config/card-item', () => {
})
it('should open settings drawer from edit action and close after saving', async () => {
const user = userEvent.setup()
const dataset = createDataset()
const { onSave } = renderItem(dataset)
const card = screen.getByText(dataset.name).closest('.group') as HTMLElement
const [editButton] = within(card).getAllByRole('button', { hidden: true })
await user.click(editButton)
const [editButton] = getActionButtons(card)
fireEvent.click(editButton)
expect(screen.getByText('Mock settings modal')).toBeInTheDocument()
await waitFor(() => {
expect(screen.getByRole('dialog')).toBeVisible()
})
await user.click(screen.getByText('Save changes'))
fireEvent.click(screen.getByRole('button', { name: 'Save changes' }))
await waitFor(() => {
expect(onSave).toHaveBeenCalledWith(expect.objectContaining({ name: 'Updated dataset' }))
@@ -188,13 +192,11 @@ describe('dataset-config/card-item', () => {
})
it('should call onRemove and toggle destructive state on hover', async () => {
const user = userEvent.setup()
const dataset = createDataset()
const { onRemove } = renderItem(dataset)
const card = screen.getByText(dataset.name).closest('.group') as HTMLElement
const buttons = within(card).getAllByRole('button', { hidden: true })
const deleteButton = buttons[buttons.length - 1]
const [, deleteButton] = getActionButtons(card)
expect(deleteButton.className).not.toContain('action-btn-destructive')
@@ -205,7 +207,7 @@ describe('dataset-config/card-item', () => {
fireEvent.mouseLeave(deleteButton)
expect(deleteButton.className).not.toContain('action-btn-destructive')
await user.click(deleteButton)
fireEvent.click(deleteButton)
expect(onRemove).toHaveBeenCalledWith(dataset.id)
})
@@ -223,14 +225,13 @@ describe('dataset-config/card-item', () => {
it('should apply mask overlay on mobile when drawer is open', async () => {
mockedUseBreakpoints.mockReturnValue(MediaType.mobile)
const user = userEvent.setup()
const dataset = createDataset()
renderItem(dataset)
const card = screen.getByText(dataset.name).closest('.group') as HTMLElement
const [editButton] = within(card).getAllByRole('button', { hidden: true })
await user.click(editButton)
const [editButton] = getActionButtons(card)
fireEvent.click(editButton)
expect(screen.getByText('Mock settings modal')).toBeInTheDocument()
const overlay = Array.from(document.querySelectorAll('[class]'))