from io import BytesIO
from typing import Optional
from sycamore.data.document import DocumentPropertyTypes
from sycamore.data.element import TableElement
import pdf2image
from sycamore.data import Document, Element
from sycamore.utils.image_utils import try_draw_boxes
from sycamore.utils.time_trace import timetrace
from PIL import Image as PImage, ImageDraw
[docs]
@timetrace("Pdf2Imgs")
def split_and_convert_to_image(doc: Document) -> list[Document]:
    """Render every page of a PDF-backed Document as an image and return one Document per page.

    The PDF bytes stored in ``doc.binary_representation`` are rasterized with ``pdf2image``. For each
    rendered page a new Document is built whose binary representation is the raw pixel buffer of the page
    image (``image.tobytes()``) and whose elements are the input elements whose page-number property
    matches that page's 1-based index. The new Document inherits the properties of the input document and
    then has ``"size"``, ``"mode"`` and the page-number property set from the rendered image.

    Every element of ``doc`` is expected to carry a page-number property; it is looked up directly when
    the elements are grouped by page.

    Args:
        doc: The input Document to split and convert. Its ``binary_representation`` should hold the bytes
            of a PDF document.

    Returns:
        A list with one Document per rendered page, in page order. If ``doc`` has no binary representation
        the list contains just ``doc`` itself, unchanged.

    Example:
        .. code-block:: python

            input_doc = Document(binary_representation=pdf_bytes, elements=elements, properties={"author": "John Doe"})
            page_docs = split_and_convert_to_image(input_doc)
    """
    pdf_bytes = doc.binary_representation
    if pdf_bytes is None:
        # Nothing to rasterize: hand the document back untouched.
        return [doc]
    page_images = pdf2image.convert_from_bytes(pdf_bytes)

    # Bucket the elements by the page number recorded in their properties.
    page_key = DocumentPropertyTypes.PAGE_NUMBER
    grouped: dict[int, list[Element]] = {}
    for element in doc.elements:
        number = element.properties[page_key]
        if number not in grouped:
            grouped[number] = []
        grouped[number].append(element)

    page_docs = []
    # Page numbers are 1-based, so start the counter at 1 rather than adding 1 at each use.
    for page_number, page_image in enumerate(page_images, start=1):
        page_doc = Document(
            binary_representation=page_image.tobytes(),
            elements=grouped.get(page_number, []),
        )
        # Copy the parent's properties first so the page-specific values below win on conflict.
        page_doc.properties.update(doc.properties)
        page_doc.properties.update(
            {"size": list(page_image.size), "mode": page_image.mode, page_key: page_number}
        )
        page_docs.append(page_doc)
    return page_docs
[docs]
class DrawBoxes:
    """
    Callable that draws labelled bounding boxes around the elements of image Documents.

    Each Document passed in is expected to hold raw image bytes in its binary representation together with
    ``"size"`` and ``"mode"`` properties describing that image (the layout produced by
    ``split_and_convert_to_image``). A box is drawn for every element, labelled with the element's type and
    colored according to a per-type color map. The annotated image is then stored back on the Document,
    encoded as PNG.

    Args:
        font_path: The path to the TrueType font file to be used for labeling.
        default_color: The color used for bounding boxes when the element type is missing or has no entry
            in the color map.
        draw_table_cells: When true, table elements that carry a table additionally have that table drawn
            onto the image.

    Example:
        .. code-block:: python

            context = sycamore.init()

            font_path="path/to/font.ttf"

            pdf_docset = (
                context.read.binary(paths, binary_format="pdf")
                .partition(partitioner=UnstructuredPdfPartitioner())
                .flat_map(split_and_convert_to_image)
                .map_batch(DrawBoxes, f_constructor_args=[font_path])
            )
    """

    def __init__(self, font_path: Optional[str] = None, default_color: str = "blue", draw_table_cells: bool = True):
        self.font_path = font_path
        self.default_color = default_color
        self.draw_table_cells = draw_table_cells
        # Element type -> box color; any type not listed here falls back to default_color.
        self.color_map = {
            "Title": "red",
            "NarrativeText": "blue",
            "UncategorizedText": "blue",
            "ListItem": "green",
            "table": "orange",
        }

    def _get_color(self, element: Element):
        """Return the box color for ``element`` based on its type, or the default color."""
        kind = element.type
        if kind is not None:
            return self.color_map.get(kind, self.default_color)
        return self.default_color

    def _draw_boxes(self, doc: Document) -> Document:
        """Annotate a single image Document in place and return it with PNG bytes."""
        props = doc.properties
        size = tuple(props["size"])
        mode = props["mode"]
        assert doc.binary_representation is not None, "Document must have binary representation to render as PNG"

        # Rebuild the image from its raw pixel buffer using the recorded size and mode.
        page = PImage.frombytes(mode=mode, size=size, data=doc.binary_representation)
        canvas = ImageDraw.Draw(page)

        def label_for(element, _):
            # Each box is labelled with the element's type.
            return element.type

        try_draw_boxes(canvas, doc.elements, text_fn=label_for, color_fn=self._get_color, font_path=self.font_path)

        if self.draw_table_cells:
            drawable = (el for el in doc.elements if isinstance(el, TableElement) and el.table is not None)
            for table_element in drawable:
                table_element.table.draw(canvas)

        # Replace the raw pixel buffer with the PNG encoding of the annotated image.
        with BytesIO() as buffer:
            page.save(buffer, format="PNG")
            doc.binary_representation = buffer.getvalue()
        return doc

    def __call__(self, docs: list[Document]) -> list[Document]:
        """Annotate every Document in ``docs`` and return them in the same order."""
        return list(map(self._draw_boxes, docs))