Files
Zhu Yongbo 5a50ae0471 Add new UI/new features for EC-RAG (#1665)
Signed-off-by: Zhu, Yongbo <yongbo.zhu@intel.com>
2025-03-20 10:46:01 +08:00

32 lines
1.2 KiB
Python

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import io
import os
from typing import Iterator
from docx.text.paragraph import Paragraph
from PIL import Image as Img
from unstructured.documents.elements import ElementMetadata, Image
from unstructured.partition.docx import DocxPartitionerOptions
UI_DIRECTORY = os.getenv("UI_TMPFILE_PATH", "/home/user/ui_cache")
IMG_OUTPUT_DIR = os.path.join(UI_DIRECTORY, "pic")
os.makedirs(IMG_OUTPUT_DIR, exist_ok=True)
class DocxParagraphPicturePartitioner:
    """Picture partitioner that exports a DOCX paragraph's embedded pictures.

    Each embedded picture found in a paragraph is written to ``IMG_OUTPUT_DIR``
    as a PNG file and reported as an ``Image`` element whose metadata carries
    the path of the written file.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Yield one ``Image`` element per embedded picture in *paragraph*.

        Args:
            paragraph: The python-docx paragraph whose XML is searched for
                ``pic:pic`` elements.
            opts: Partitioner options; ``opts.document.part.related_parts`` is
                used to resolve a picture's relationship id to its image part.

        Yields:
            ``Image`` elements with the placeholder text ``"IMAGE"`` and
            ``metadata.image_path`` set to the PNG written for that picture.

        Raises:
            KeyError: If a picture's relationship id is not present in the
                document part's related parts.
            OSError: If the image data cannot be decoded or written.
        """
        for pic in paragraph._element.xpath(".//pic:pic"):
            embed_ids = pic.xpath(".//a:blip/@r:embed")
            if not embed_ids:
                # No r:embed attribute (e.g. a linked picture) means there is
                # no embedded blob to export; skip it instead of failing with
                # an IndexError that would abort the whole document.
                continue
            embed = embed_ids[0]
            related_part = opts.document.part.related_parts[embed]

            # The context manager releases the decoder's resources once the
            # file has been written.
            with Img.open(io.BytesIO(related_part.blob)) as image:
                # File name combines the relationship id and the part's SHA1.
                image_path = os.path.join(IMG_OUTPUT_DIR, str(embed) + related_part.sha1 + ".png")
                if image.mode == "CMYK":
                    # The PNG writer does not accept CMYK data; convert so
                    # that CMYK JPEGs can still be exported.
                    image = image.convert("RGB")
                # NOTE(review): formats Pillow cannot load on this platform
                # (presumably EMF/WMF) still raise here - confirm whether
                # such pictures should be skipped instead.
                image.save(image_path)

            yield Image(text="IMAGE", metadata=ElementMetadata(image_path=image_path))