mirror of
https://github.com/langgenius/dify.git
synced 2026-01-08 07:14:14 +00:00
feat: support image extraction in PDF RAG extractor (#30399)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
@@ -112,7 +112,7 @@ class ExtractProcessor:
|
||||
if file_extension in {".xlsx", ".xls"}:
|
||||
extractor = ExcelExtractor(file_path)
|
||||
elif file_extension == ".pdf":
|
||||
extractor = PdfExtractor(file_path)
|
||||
extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||
elif file_extension in {".md", ".markdown", ".mdx"}:
|
||||
extractor = (
|
||||
UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||
@@ -148,7 +148,7 @@ class ExtractProcessor:
|
||||
if file_extension in {".xlsx", ".xls"}:
|
||||
extractor = ExcelExtractor(file_path)
|
||||
elif file_extension == ".pdf":
|
||||
extractor = PdfExtractor(file_path)
|
||||
extractor = PdfExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||
elif file_extension in {".md", ".markdown", ".mdx"}:
|
||||
extractor = MarkdownExtractor(file_path, autodetect_encoding=True)
|
||||
elif file_extension in {".htm", ".html"}:
|
||||
|
||||
@@ -1,25 +1,57 @@
|
||||
"""Abstract interface for document loader implementations."""
|
||||
|
||||
import contextlib
|
||||
import io
|
||||
import logging
|
||||
import uuid
|
||||
from collections.abc import Iterator
|
||||
|
||||
import pypdfium2
|
||||
import pypdfium2.raw as pdfium_c
|
||||
|
||||
from configs import dify_config
|
||||
from core.rag.extractor.blob.blob import Blob
|
||||
from core.rag.extractor.extractor_base import BaseExtractor
|
||||
from core.rag.models.document import Document
|
||||
from extensions.ext_database import db
|
||||
from extensions.ext_storage import storage
|
||||
from libs.datetime_utils import naive_utc_now
|
||||
from models.enums import CreatorUserRole
|
||||
from models.model import UploadFile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PdfExtractor(BaseExtractor):
|
||||
"""Load pdf files.
|
||||
|
||||
"""
|
||||
PdfExtractor is used to extract text and images from PDF files.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to load.
|
||||
file_path: Path to the PDF file.
|
||||
tenant_id: Workspace ID.
|
||||
user_id: ID of the user performing the extraction.
|
||||
file_cache_key: Optional cache key for the extracted text.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: str, file_cache_key: str | None = None):
|
||||
"""Initialize with file path."""
|
||||
# Magic bytes for image format detection: (magic_bytes, extension, mime_type)
|
||||
IMAGE_FORMATS = [
|
||||
(b"\xff\xd8\xff", "jpg", "image/jpeg"),
|
||||
(b"\x89PNG\r\n\x1a\n", "png", "image/png"),
|
||||
(b"\x00\x00\x00\x0c\x6a\x50\x20\x20\x0d\x0a\x87\x0a", "jp2", "image/jp2"),
|
||||
(b"GIF8", "gif", "image/gif"),
|
||||
(b"BM", "bmp", "image/bmp"),
|
||||
(b"II*\x00", "tiff", "image/tiff"),
|
||||
(b"MM\x00*", "tiff", "image/tiff"),
|
||||
(b"II+\x00", "tiff", "image/tiff"),
|
||||
(b"MM\x00+", "tiff", "image/tiff"),
|
||||
]
|
||||
MAX_MAGIC_LEN = max(len(m) for m, _, _ in IMAGE_FORMATS)
|
||||
|
||||
def __init__(self, file_path: str, tenant_id: str, user_id: str, file_cache_key: str | None = None):
|
||||
"""Initialize PdfExtractor."""
|
||||
self._file_path = file_path
|
||||
self._tenant_id = tenant_id
|
||||
self._user_id = user_id
|
||||
self._file_cache_key = file_cache_key
|
||||
|
||||
def extract(self) -> list[Document]:
|
||||
@@ -50,7 +82,6 @@ class PdfExtractor(BaseExtractor):
|
||||
|
||||
def parse(self, blob: Blob) -> Iterator[Document]:
|
||||
"""Lazily parse the blob."""
|
||||
import pypdfium2 # type: ignore
|
||||
|
||||
with blob.as_bytes_io() as file_path:
|
||||
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
||||
@@ -59,8 +90,87 @@ class PdfExtractor(BaseExtractor):
|
||||
text_page = page.get_textpage()
|
||||
content = text_page.get_text_range()
|
||||
text_page.close()
|
||||
|
||||
image_content = self._extract_images(page)
|
||||
if image_content:
|
||||
content += "\n" + image_content
|
||||
|
||||
page.close()
|
||||
metadata = {"source": blob.source, "page": page_number}
|
||||
yield Document(page_content=content, metadata=metadata)
|
||||
finally:
|
||||
pdf_reader.close()
|
||||
|
||||
def _extract_images(self, page) -> str:
|
||||
"""
|
||||
Extract images from a PDF page, save them to storage and database,
|
||||
and return markdown image links.
|
||||
|
||||
Args:
|
||||
page: pypdfium2 page object.
|
||||
|
||||
Returns:
|
||||
Markdown string containing links to the extracted images.
|
||||
"""
|
||||
image_content = []
|
||||
upload_files = []
|
||||
base_url = dify_config.INTERNAL_FILES_URL or dify_config.FILES_URL
|
||||
|
||||
try:
|
||||
image_objects = page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))
|
||||
for obj in image_objects:
|
||||
try:
|
||||
# Extract image bytes
|
||||
img_byte_arr = io.BytesIO()
|
||||
# Extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly
|
||||
# Fallback to png for other formats
|
||||
obj.extract(img_byte_arr, fb_format="png")
|
||||
img_bytes = img_byte_arr.getvalue()
|
||||
|
||||
if not img_bytes:
|
||||
continue
|
||||
|
||||
header = img_bytes[: self.MAX_MAGIC_LEN]
|
||||
image_ext = None
|
||||
mime_type = None
|
||||
for magic, ext, mime in self.IMAGE_FORMATS:
|
||||
if header.startswith(magic):
|
||||
image_ext = ext
|
||||
mime_type = mime
|
||||
break
|
||||
|
||||
if not image_ext or not mime_type:
|
||||
continue
|
||||
|
||||
file_uuid = str(uuid.uuid4())
|
||||
file_key = "image_files/" + self._tenant_id + "/" + file_uuid + "." + image_ext
|
||||
|
||||
storage.save(file_key, img_bytes)
|
||||
|
||||
# save file to db
|
||||
upload_file = UploadFile(
|
||||
tenant_id=self._tenant_id,
|
||||
storage_type=dify_config.STORAGE_TYPE,
|
||||
key=file_key,
|
||||
name=file_key,
|
||||
size=len(img_bytes),
|
||||
extension=image_ext,
|
||||
mime_type=mime_type,
|
||||
created_by=self._user_id,
|
||||
created_by_role=CreatorUserRole.ACCOUNT,
|
||||
created_at=naive_utc_now(),
|
||||
used=True,
|
||||
used_by=self._user_id,
|
||||
used_at=naive_utc_now(),
|
||||
)
|
||||
upload_files.append(upload_file)
|
||||
image_content.append(f"")
|
||||
except Exception as e:
|
||||
logger.warning("Failed to extract image from PDF: %s", e)
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.warning("Failed to get objects from PDF page: %s", e)
|
||||
if upload_files:
|
||||
db.session.add_all(upload_files)
|
||||
db.session.commit()
|
||||
return "\n".join(image_content)
|
||||
|
||||
186
api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py
Normal file
186
api/tests/unit_tests/core/rag/extractor/test_pdf_extractor.py
Normal file
@@ -0,0 +1,186 @@
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import core.rag.extractor.pdf_extractor as pe
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_dependencies(monkeypatch):
|
||||
# Mock storage
|
||||
saves = []
|
||||
|
||||
def save(key, data):
|
||||
saves.append((key, data))
|
||||
|
||||
monkeypatch.setattr(pe, "storage", SimpleNamespace(save=save))
|
||||
|
||||
# Mock db
|
||||
class DummySession:
|
||||
def __init__(self):
|
||||
self.added = []
|
||||
self.committed = False
|
||||
|
||||
def add(self, obj):
|
||||
self.added.append(obj)
|
||||
|
||||
def add_all(self, objs):
|
||||
self.added.extend(objs)
|
||||
|
||||
def commit(self):
|
||||
self.committed = True
|
||||
|
||||
db_stub = SimpleNamespace(session=DummySession())
|
||||
monkeypatch.setattr(pe, "db", db_stub)
|
||||
|
||||
# Mock UploadFile
|
||||
class FakeUploadFile:
|
||||
DEFAULT_ID = "test_file_id"
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
# Assign id from DEFAULT_ID, allow override via kwargs if needed
|
||||
self.id = self.DEFAULT_ID
|
||||
for k, v in kwargs.items():
|
||||
setattr(self, k, v)
|
||||
|
||||
monkeypatch.setattr(pe, "UploadFile", FakeUploadFile)
|
||||
|
||||
# Mock config
|
||||
monkeypatch.setattr(pe.dify_config, "FILES_URL", "http://files.local")
|
||||
monkeypatch.setattr(pe.dify_config, "INTERNAL_FILES_URL", None)
|
||||
monkeypatch.setattr(pe.dify_config, "STORAGE_TYPE", "local")
|
||||
|
||||
return SimpleNamespace(saves=saves, db=db_stub, UploadFile=FakeUploadFile)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("image_bytes", "expected_mime", "expected_ext", "file_id"),
|
||||
[
|
||||
(b"\xff\xd8\xff some jpeg", "image/jpeg", "jpg", "test_file_id_jpeg"),
|
||||
(b"\x89PNG\r\n\x1a\n some png", "image/png", "png", "test_file_id_png"),
|
||||
],
|
||||
)
|
||||
def test_extract_images_formats(mock_dependencies, monkeypatch, image_bytes, expected_mime, expected_ext, file_id):
|
||||
saves = mock_dependencies.saves
|
||||
db_stub = mock_dependencies.db
|
||||
|
||||
# Customize FakeUploadFile id for this test case.
|
||||
# Using monkeypatch ensures the class attribute is reset between parameter sets.
|
||||
monkeypatch.setattr(mock_dependencies.UploadFile, "DEFAULT_ID", file_id)
|
||||
|
||||
# Mock page and image objects
|
||||
mock_page = MagicMock()
|
||||
mock_image_obj = MagicMock()
|
||||
|
||||
def mock_extract(buf, fb_format=None):
|
||||
buf.write(image_bytes)
|
||||
|
||||
mock_image_obj.extract.side_effect = mock_extract
|
||||
|
||||
mock_page.get_objects.return_value = [mock_image_obj]
|
||||
|
||||
extractor = pe.PdfExtractor(file_path="test.pdf", tenant_id="t1", user_id="u1")
|
||||
|
||||
# We need to handle the import inside _extract_images
|
||||
with patch("pypdfium2.raw") as mock_raw:
|
||||
mock_raw.FPDF_PAGEOBJ_IMAGE = 1
|
||||
result = extractor._extract_images(mock_page)
|
||||
|
||||
assert f"" in result
|
||||
assert len(saves) == 1
|
||||
assert saves[0][1] == image_bytes
|
||||
assert len(db_stub.session.added) == 1
|
||||
assert db_stub.session.added[0].tenant_id == "t1"
|
||||
assert db_stub.session.added[0].size == len(image_bytes)
|
||||
assert db_stub.session.added[0].mime_type == expected_mime
|
||||
assert db_stub.session.added[0].extension == expected_ext
|
||||
assert db_stub.session.committed is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("get_objects_side_effect", "get_objects_return_value"),
|
||||
[
|
||||
(None, []), # Empty list
|
||||
(None, None), # None returned
|
||||
(Exception("Failed to get objects"), None), # Exception raised
|
||||
],
|
||||
)
|
||||
def test_extract_images_get_objects_scenarios(mock_dependencies, get_objects_side_effect, get_objects_return_value):
|
||||
mock_page = MagicMock()
|
||||
if get_objects_side_effect:
|
||||
mock_page.get_objects.side_effect = get_objects_side_effect
|
||||
else:
|
||||
mock_page.get_objects.return_value = get_objects_return_value
|
||||
|
||||
extractor = pe.PdfExtractor(file_path="test.pdf", tenant_id="t1", user_id="u1")
|
||||
|
||||
with patch("pypdfium2.raw") as mock_raw:
|
||||
mock_raw.FPDF_PAGEOBJ_IMAGE = 1
|
||||
result = extractor._extract_images(mock_page)
|
||||
|
||||
assert result == ""
|
||||
|
||||
|
||||
def test_extract_calls_extract_images(mock_dependencies, monkeypatch):
|
||||
# Mock pypdfium2
|
||||
mock_pdf_doc = MagicMock()
|
||||
mock_page = MagicMock()
|
||||
mock_pdf_doc.__iter__.return_value = [mock_page]
|
||||
|
||||
# Mock text extraction
|
||||
mock_text_page = MagicMock()
|
||||
mock_text_page.get_text_range.return_value = "Page text content"
|
||||
mock_page.get_textpage.return_value = mock_text_page
|
||||
|
||||
with patch("pypdfium2.PdfDocument", return_value=mock_pdf_doc):
|
||||
# Mock Blob
|
||||
mock_blob = MagicMock()
|
||||
mock_blob.source = "test.pdf"
|
||||
with patch("core.rag.extractor.pdf_extractor.Blob.from_path", return_value=mock_blob):
|
||||
extractor = pe.PdfExtractor(file_path="test.pdf", tenant_id="t1", user_id="u1")
|
||||
|
||||
# Mock _extract_images to return a known string
|
||||
monkeypatch.setattr(extractor, "_extract_images", lambda p: "")
|
||||
|
||||
documents = list(extractor.extract())
|
||||
|
||||
assert len(documents) == 1
|
||||
assert "Page text content" in documents[0].page_content
|
||||
assert "" in documents[0].page_content
|
||||
assert documents[0].metadata["page"] == 0
|
||||
|
||||
|
||||
def test_extract_images_failures(mock_dependencies):
|
||||
saves = mock_dependencies.saves
|
||||
db_stub = mock_dependencies.db
|
||||
|
||||
# Mock page and image objects
|
||||
mock_page = MagicMock()
|
||||
mock_image_obj_fail = MagicMock()
|
||||
mock_image_obj_ok = MagicMock()
|
||||
|
||||
# First image raises exception
|
||||
mock_image_obj_fail.extract.side_effect = Exception("Extraction failure")
|
||||
|
||||
# Second image is OK (JPEG)
|
||||
jpeg_bytes = b"\xff\xd8\xff some image data"
|
||||
|
||||
def mock_extract(buf, fb_format=None):
|
||||
buf.write(jpeg_bytes)
|
||||
|
||||
mock_image_obj_ok.extract.side_effect = mock_extract
|
||||
|
||||
mock_page.get_objects.return_value = [mock_image_obj_fail, mock_image_obj_ok]
|
||||
|
||||
extractor = pe.PdfExtractor(file_path="test.pdf", tenant_id="t1", user_id="u1")
|
||||
|
||||
with patch("pypdfium2.raw") as mock_raw:
|
||||
mock_raw.FPDF_PAGEOBJ_IMAGE = 1
|
||||
result = extractor._extract_images(mock_page)
|
||||
|
||||
# Should have one success
|
||||
assert "" in result
|
||||
assert len(saves) == 1
|
||||
assert saves[0][1] == jpeg_bytes
|
||||
assert db_stub.session.committed is True
|
||||
Reference in New Issue
Block a user