# Source code for sycamore.transforms.bbox_merge

from typing import Optional


from sycamore.data import Document, Element
from sycamore.data.document import DocumentPropertyTypes
from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
from sycamore.transforms.map import Map
from sycamore.utils.time_trace import TimeTrace, timetrace


def validBbox(bbox):
    """
    Report whether the first four bbox coordinates stay within [0.0, 1.0].

    Args:
        bbox: Indexable of numeric coordinates; positions 0 through 3 are
            examined in order.

    Returns:
        False as soon as a coordinate is below 0.0 or above 1.0, otherwise
        True once all four positions have been checked.
    """
    # Generator keeps the indexing lazy, so checking stops at the first
    # out-of-range coordinate.
    coords = (bbox[pos] for pos in range(4))
    return not any((coord < 0.0) or (coord > 1.0) for coord in coords)


def getBboxLeftTop(elem: Element):
    """
    Build a (left, top) sort key from an Element's bbox.

    Args:
        elem: Element whose ``data`` mapping may hold a "bbox" entry.

    Returns:
        ``(bbox[0], bbox[1])`` when a bbox is present, otherwise
        ``(0.0, 0.0)``.
    """
    box = elem.data.get("bbox")
    return (0.0, 0.0) if box is None else (box[0], box[1])


def getPageTopLeft(elem: Element):
    """
    Build a (page, top, left) sort key for an Element.

    Args:
        elem: Element whose ``properties`` hold the page number and whose
            ``data`` mapping may hold a "bbox" entry.

    Returns:
        ``(page_number, bbox[1], bbox[0])`` when a bbox is present,
        otherwise ``(page_number, 0.0, 0.0)``.

    Raises:
        KeyError: presumably, when ``properties`` is a plain mapping with no
            page number entry — confirm against the Element type.
    """
    box = elem.data.get("bbox")
    page = elem.properties[DocumentPropertyTypes.PAGE_NUMBER]
    if box is None:
        return (page, 0.0, 0.0)
    return (page, box[1], box[0])


def getRow(elem: Element, elements: list[Element]) -> list[Element]:
    """
    Collect the Elements that sit side by side with ``elem`` on its page.

    An Element joins the row when it is on the same page as ``elem``, its
    bbox overlaps ``elem`` vertically, and it lies entirely to the left or
    entirely to the right of ``elem``.

    Args:
        elem: The Element whose row is wanted.
        elements: All candidate Elements, sorted by page number and then by
            bbox top (the order produced by sorting on getPageTopLeft).

    Returns:
        A list that always contains ``elem`` plus the Elements found beside
        it, sorted by (left, top).  When ``elem`` has no bbox the list holds
        only ``elem``.
    """
    rv = [elem]

    bbox = elem.data.get("bbox")
    if bbox is None:
        return rv
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    page = elem.properties[DocumentPropertyTypes.PAGE_NUMBER]

    # Lower-bound binary search for the first Element on elem's page.
    # Starting at the page boundary (rather than near elem's own top) keeps
    # the scan from beginning on an earlier page, and lets it see taller
    # Elements that start above elem but still overlap it.
    n = len(elements)
    beg = 0
    end = n
    while beg < end:
        mid = beg + ((end - beg) // 2)
        if elements[mid].properties[DocumentPropertyTypes.PAGE_NUMBER] < page:
            beg = mid + 1
        else:
            end = mid

    for pos in range(beg, n):
        ee = elements[pos]
        if ee.properties[DocumentPropertyTypes.PAGE_NUMBER] != page:
            break  # left elem's page; later pages can never share its row
        bb = ee.data.get("bbox")
        if bb is None:
            continue  # Elements without a bbox cannot be placed in a row
        if bb[1] > bottom:
            break  # sorted by top: everything after starts below elem
        if bb[3] < top:
            continue  # ends above elem
        if (bb[0] > right) or (bb[2] < left):
            rv.append(ee)

    rv.sort(key=getBboxLeftTop)
    return rv


def partOfTwoCol(elem: Element, xmin, xmax) -> bool:
    """
    Decide whether an Element looks like one column of a two-column layout.

    Args:
        elem: Element whose "_colCnt" and "bbox" data entries are inspected.
        xmin: Left edge of the horizontal extent in use.
        xmax: Right edge of the horizontal extent in use.

    Returns:
        True when "_colCnt" is 2, the bbox is at most half as wide as the
        extent, and its left edge starts either within the first 10% of the
        extent or between 45% and 60% of it.  False otherwise, including
        when the extent ``xmax - xmin`` is zero or negative.
    """
    if elem.data.get("_colCnt") != 2:
        return False
    bb = elem.data.get("bbox")
    if bb is None:
        return False
    pageWidth = xmax - xmin
    if pageWidth <= 0:
        # Degenerate extent: no column layout can be inferred, and dividing
        # by it below would raise ZeroDivisionError or flip the fraction.
        return False
    left = bb[0]
    width = bb[2] - left
    if width > pageWidth / 2:
        return False
    frac = (left - xmin) / pageWidth
    return (frac <= 0.1) or (0.45 <= frac <= 0.6)


###############################################################################



# [docs]
class SortByPageBbox(SingleThreadUser, NonGPUUser, Map):
    """
    SortByPageBbox is a transform that puts the Elements of each Document
    into 'natural order': by page_number first, then top to bottom, then
    left to right, using each Element's bbox.

    Args:
        child: The source Node or component that provides the Elements

    Example:
        .. code-block:: python

            source_node = ...
            sorter = SortByPageBbox(child=source_node)
            dataset = sorter.execute()
    """

    def __init__(self, child: Node, **resource_args):
        sorter = SortByPageBbox.sort_by_page_bbox
        super().__init__(child, f=sorter, **resource_args)

    @staticmethod
    def sort_by_page_bbox(parent: Document) -> Document:
        """
        Sort the list returned by ``parent.elements`` using getPageTopLeft.

        Elements without a bbox get the key (page, 0.0, 0.0) and therefore
        sort to the top of their page.

        Args:
            parent: Document whose Elements are reordered.

        Returns:
            The same ``parent`` object.
        """
        with TimeTrace("sortPageBbox"):
            elems = parent.elements
            elems.sort(key=getPageTopLeft)
            return parent



###############################################################################



# [docs]
class MarkDropHeaderFooter(SingleThreadUser, NonGPUUser, Map):
    """
    MarkDropHeaderFooter is a transform that sets the '_drop' data attribute
    on every Element lying entirely inside the top or bottom margin of the
    page.  Elements without a 'bbox' attribute are left untouched.

    Args:
        child: The source Node or component that provides the Elements
        top: The fraction of the page to exclude from the top (def 0.05)
        bottom: The fraction of the page to exclude from the bottom
            (def None, meaning the same fraction as top)

    Example:
        .. code-block:: python

            source_node = ...
            marker = MarkDropHeaderFooter(child=source_node, top=0.05)
            dataset = marker.execute()
    """

    def __init__(self, child: Node, top: float = 0.05, bottom: Optional[float] = None, **resource_args):
        marker = MarkDropHeaderFooter.mark_drop_header_and_footer
        super().__init__(child, f=marker, args=[top, bottom], **resource_args)

    @staticmethod
    @timetrace("markHeadFoot")
    def mark_drop_header_and_footer(parent: Document, top: float = 0.05, bottom: Optional[float] = None) -> Document:
        """
        Flag header and footer Elements of ``parent`` with ``_drop = True``.

        An Element is flagged when its bbox top edge (bbox[1]) is below
        ``1.0 - bottom`` or its bbox bottom edge (bbox[3]) is above ``top``.

        Args:
            parent: Document whose Elements are examined.
            top: Fraction of the page treated as header.
            bottom: Fraction of the page treated as footer; None reuses top.

        Returns:
            The same ``parent`` object.
        """
        headerLimit = top
        footerLimit = 1.0 - (top if bottom is None else bottom)

        for elem in parent.elements:
            bbox = elem.data.get("bbox")
            if bbox is None:
                continue
            if (bbox[1] > footerLimit) or (bbox[3] < headerLimit):
                elem.data["_drop"] = True  # mark for removal
        return parent



###############################################################################



# [docs]
class MarkBreakByColumn(SingleThreadUser, NonGPUUser, Map):
    """
    MarkBreakByColumn is a transform that marks '_break' where
    two-column layout changes to full-width layout.  Ranges of two-
    column Elements are also re-sorted left to right within each page.
    Elements must already be sorted top-to-bottom.

    Each Element also receives '_colIdx' (its column position within its
    row) and '_colCnt' (number of columns in its row; 0 signals a lone
    full-width Element).

    Args:
        child: The source Node or component that provides the Elements

    Example:
        .. code-block:: python

            source_node = ...
            marker = MarkBreakByColumn(child=source_node)
            dataset = marker.execute()
    """

    def __init__(self, child: Node, **resource_args):
        super().__init__(child, f=MarkBreakByColumn.mark_break_by_column, **resource_args)

    @staticmethod
    @timetrace("makeBreakCol")
    def mark_break_by_column(parent: Document) -> Document:
        """
        Tag Elements with column info, re-sort two-column runs, mark breaks.

        Args:
            parent: Document whose Elements are sorted by page and top.

        Returns:
            The same ``parent`` object, with Element data updated and
            two-column runs reordered in the list from ``parent.elements``.
        """
        elements = parent.elements

        # measure width in-use
        xmin = 1.0  # FIXME are these global?
        xmax = 0.0
        for elem in elements:
            bbox = elem.data.get("bbox")
            if (bbox is not None) and validBbox(bbox):
                xmin = min(xmin, bbox[0])
                xmax = max(xmax, bbox[2])
        if xmin < xmax:
            fullWidth = (xmax - xmin) * 0.8  # fudge
        else:
            fullWidth = 0.8

        # tag elements by column
        for elem in elements:
            if elem.data.get("_colIdx") is not None:
                continue  # already tagged as a member of an earlier row
            row = getRow(elem, elements)
            if len(row) == 1:
                bbox = elem.data.get("bbox")
                width = 0.0 if bbox is None else bbox[2] - bbox[0]
                elem.data["_colIdx"] = 0
                elem.data["_colCnt"] = 0 if width > fullWidth else 1  # 0 signals full-width
                continue
            colIdx = -1
            rightEdge = 0.0
            for ee in row:
                bbox = ee.data["bbox"]
                if bbox[0] >= rightEdge:  # may be stacked vertically
                    colIdx += 1
                # Keep the widest right edge seen so far, so a narrow
                # stacked Element cannot make a later one in the same
                # column look like a new column.
                rightEdge = max(rightEdge, bbox[2])
                if ee.data.get("_colIdx") is None:
                    ee.data["_colIdx"] = colIdx
            for ee in row:
                if ee.data.get("_colCnt") is None:
                    ee.data["_colCnt"] = colIdx + 1

        # re-sort ranges of two-column text
        def pageLeftTop(elem):
            # Page comes first so a run spanning a page boundary is never
            # interleaved across pages.
            left, top = getBboxLeftTop(elem)
            return (elem.properties[DocumentPropertyTypes.PAGE_NUMBER], left, top)

        n = len(elements)
        last = -1  # index of the most recent Element that is not two-column
        ranges = []
        for idx, elem in enumerate(elements):
            if not partOfTwoCol(elem, xmin, xmax):
                if (idx - last) > 4:
                    ranges.append((last + 1, idx))
                last = idx
        if (n - last) > 4:
            # a two-column run that reaches the end of the document
            ranges.append((last + 1, n))
        for xx, yy in ranges:
            elements[xx:yy] = sorted(elements[xx:yy], key=pageLeftTop)

        # mark breaks due to column transitions
        lastCols = 0
        for elem in elements:
            ecols = elem.data["_colCnt"]
            if ecols != lastCols:
                if ecols == 0:
                    elem.data["_break"] = True
                lastCols = ecols

        return parent