# Source code for sycamore.functions.document

from io import BytesIO
from typing import Optional

from sycamore.data.document import DocumentPropertyTypes
from sycamore.data.element import TableElement

import pdf2image

from sycamore.data import Document, Element
from sycamore.utils.image_utils import try_draw_boxes
from sycamore.utils.time_trace import timetrace
from PIL import Image as PImage, ImageDraw



# [docs]
@timetrace("Pdf2Imgs")
def split_and_convert_to_image(doc: Document) -> list[Document]:
    """Render each page of a PDF-backed Document as an image and return one Document per page.

    The bytes in ``doc.binary_representation`` are handed to ``pdf2image.convert_from_bytes``, which yields one
    image per page. For every rendered page a new Document is built whose binary representation is the raw
    image bytes (``image.tobytes()``) and whose elements are the elements of ``doc`` recorded for that page.

    Every new Document starts with a copy of the entries in ``doc.properties`` and then has three entries
    set (overriding any inherited value): ``"size"`` (the image size as a list), ``"mode"`` (the image mode)
    and the page-number property. Page numbers are 1-based and follow the order of the rendered images.

    Args:
        doc: The Document to split. Its elements must each carry the page-number property; a missing
            property raises ``KeyError``.

    Returns:
        A list with one Document per rendered page. If ``doc`` has no binary representation, the list
        contains only ``doc`` itself, unchanged.

    Example:
         .. code-block:: python

            input_doc = Document(binary_representation=pdf_bytes, elements=elements, properties={"author": "John Doe"})
            page_docs = split_and_convert_to_image(input_doc)

    """

    pdf_bytes = doc.binary_representation
    if pdf_bytes is None:
        # Nothing to render: pass the document through untouched.
        return [doc]

    page_images = pdf2image.convert_from_bytes(pdf_bytes)

    # Bucket the elements by the page they were found on.
    elements_by_page: dict[int, list[Element]] = {}
    for element in doc.elements:
        page_key = element.properties[DocumentPropertyTypes.PAGE_NUMBER]
        if page_key not in elements_by_page:
            elements_by_page[page_key] = []
        elements_by_page[page_key].append(element)

    page_docs = []
    for page_number, page_image in enumerate(page_images, start=1):
        # Pages without any recorded elements still produce a Document, with an empty element list.
        page_elements = elements_by_page.get(page_number, [])
        page_doc = Document(binary_representation=page_image.tobytes(), elements=page_elements)

        # Inherit the parent's properties first so the page-specific values below take precedence.
        page_doc.properties.update(doc.properties)
        page_specific = {
            "size": list(page_image.size),
            "mode": page_image.mode,
            DocumentPropertyTypes.PAGE_NUMBER: page_number,
        }
        page_doc.properties.update(page_specific)

        page_docs.append(page_doc)
    return page_docs




# [docs]
class DrawBoxes:
    """
    Callable that draws element bounding boxes onto page images stored in Document objects.

    Each input Document is expected to hold raw image bytes in its binary representation together with
    ``"size"`` and ``"mode"`` properties describing that image. The boxes are drawn via ``try_draw_boxes``,
    labelled with each element's type and colored according to an internal type-to-color map. The document's
    binary representation is then replaced, in place, with the PNG encoding of the annotated image.

    Args:
        font_path: The path to the TrueType font file to be used for labeling.
        default_color: The color used when an element has no type or its type is not in the color map.
        draw_table_cells: When True, additionally calls ``table.draw`` on the canvas for every
            ``TableElement`` whose ``table`` is not None.

    Example:

          .. code-block:: python

            context = sycamore.init()

            font_path="path/to/font.ttf"

            pdf_docset = (
                context.read.binary(paths, binary_format="pdf")
                .partition(partitioner=ArynPartitioner())
                .flat_map(split_and_convert_to_image)
                .map_batch(DrawBoxes, f_constructor_args=[font_path])
            )
    """

    def __init__(self, font_path: Optional[str] = None, default_color: str = "blue", draw_table_cells: bool = True):
        self.font_path = font_path
        self.default_color = default_color
        self.draw_table_cells = draw_table_cells
        # Box color per element type; types not listed here fall back to default_color.
        self.color_map = {
            "Title": "red",
            "NarrativeText": "blue",
            "UncategorizedText": "blue",
            "ListItem": "green",
            "table": "orange",
        }

    def _get_color(self, element: Element):
        """Return the box color for ``element``, falling back to the default color."""
        element_type = element.type
        if element_type is not None:
            return self.color_map.get(element_type, self.default_color)
        return self.default_color

    def _draw_boxes(self, doc: Document) -> Document:
        """Annotate the image held by ``doc`` and store the result back on ``doc`` as PNG bytes.

        The same Document instance is returned; only its binary representation is replaced.
        """
        size = tuple(doc.properties["size"])
        mode = doc.properties["mode"]
        assert doc.binary_representation is not None, "Document must have binary representation to render as PNG"

        # Rebuild the image from its raw bytes using the recorded mode and size.
        image = PImage.frombytes(mode=mode, size=size, data=doc.binary_representation)
        canvas = ImageDraw.Draw(image)

        def label_for(e, _):
            # Each box is labelled with the element's type.
            return e.type

        try_draw_boxes(
            canvas,
            doc.elements,
            text_fn=label_for,
            color_fn=self._get_color,
            font_path=self.font_path,
        )

        if self.draw_table_cells:
            tables = (e.table for e in doc.elements if isinstance(e, TableElement) and e.table is not None)
            for table in tables:
                table.draw(canvas)

        # Encode the annotated image as PNG and overwrite the document's raw image bytes with it.
        with BytesIO() as png_buffer:
            image.save(png_buffer, format="PNG")
            doc.binary_representation = png_buffer.getvalue()
        return doc

    def __call__(self, docs: list[Document]) -> list[Document]:
        """Annotate every Document in ``docs`` and return them in the same order."""
        return list(map(self._draw_boxes, docs))