Source code for sycamore.transforms.mark_misc

from typing import Optional

from sycamore.data import Document
from sycamore.data.document import DocumentPropertyTypes
from sycamore.functions.tokenizer import Tokenizer
from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
from sycamore.transforms import Map
from sycamore.utils.time_trace import timetrace

# TODO:
# - make breaks balanced in size
# - maybe move token counting elsewhere to avoid duplicate work


class MarkDropTiny(SingleThreadUser, NonGPUUser, Map):
    """
    MarkDropTiny is a transform that flags very small Elements for removal
    by setting the '_drop' data attribute on them.

    An Element is flagged when the length of its text representation is
    strictly less than ``minimum``; a missing or empty text representation
    counts as length zero.

    Args:
        child: The source Node or component that provides the Elements
        minimum: The smallest Element to keep (def 2)

    Example:
        .. code-block:: python

            source_node = ...
            marker = MarkDropTiny(child=source_node, minimum=2)
            dataset = marker.execute()
    """

    def __init__(self, child: Node, minimum: int = 2, **resource_args):
        super().__init__(
            child,
            f=MarkDropTiny.mark_drop_tiny,
            args=[minimum],
            **resource_args,
        )

    @staticmethod
    @timetrace("markDropTiny")
    def mark_drop_tiny(parent: Document, minimum) -> Document:
        """
        Set ``data["_drop"] = True`` on every element of ``parent`` whose
        text is shorter than ``minimum`` characters.

        Args:
            parent: The Document whose elements are inspected and marked
            minimum: Elements with fewer characters than this are marked

        Returns:
            The same ``parent`` Document, with its elements marked in place.
        """
        for element in parent.elements:
            text = element.text_representation
            # None and "" are both treated as zero-length text.
            text_length = len(text) if text else 0
            if text_length < minimum:
                # Only mark here; nothing is removed in this transform.
                element.data["_drop"] = True
        return parent
###############################################################################
class MarkBreakPage(SingleThreadUser, NonGPUUser, Map):
    """
    MarkBreakPage is a transform that sets the '_break' data attribute on
    each Element whose 'page_number' property differs from that of the
    Element before it.

    Args:
        child: The source Node or component that provides the Elements

    Example:
        .. code-block:: python

            source_node = ...
            marker = MarkBreakPage(child=source_node)
            dataset = marker.execute()
    """

    def __init__(self, child: Node, **resource_args):
        super().__init__(
            child,
            f=MarkBreakPage.mark_break_page,
            **resource_args,
        )

    @staticmethod
    @timetrace("markBreakPage")
    def mark_break_page(parent: Document) -> Document:
        """
        Set ``data["_break"] = True`` on the first element of every new page.

        Documents with fewer than two elements are returned untouched. The
        page number is read by direct indexing into each element's
        properties, so every element is expected to carry that property.

        Args:
            parent: The Document whose elements are inspected and marked

        Returns:
            The same ``parent`` Document, with its elements marked in place.
        """
        elements = parent.elements
        if len(elements) <= 1:
            # Zero or one element: no page transition is possible.
            return parent

        page_key = DocumentPropertyTypes.PAGE_NUMBER
        current_page = elements[0].properties[page_key]
        for element in elements:
            element_page = element.properties[page_key]
            if element_page == current_page:
                continue
            # Page changed: flag this element and track the new page.
            element.data["_break"] = True
            current_page = element_page
        return parent
###############################################################################
class MarkBreakByTokens(SingleThreadUser, NonGPUUser, Map):
    """
    MarkBreakByTokens is a transform that sets the '_break' data attribute
    on an Element whenever adding it to the current run of Elements would
    push the accumulated token count over the limit. It also records each
    Element's own token count in the '_tokCnt' data attribute.

    This should most likely be the last marking operation before final
    merge.

    Args:
        child: The source Node or component that provides the Elements
        tokenizer: the tokenizer that will be used for embedding
        limit: maximum permitted number of tokens

    Example:
        .. code-block:: python

            source_node = ...
            tokenizer = OpenAITokenizer("text-embedding-3-small")
            marker = MarkBreakByTokens(child=source_node, tokenizer=tokenizer, limit=512)
            dataset = marker.execute()
    """

    def __init__(self, child: Node, tokenizer: Tokenizer, limit: int = 512, **resource_args):
        super().__init__(
            child,
            f=MarkBreakByTokens.mark_break_by_tokens,
            args=[tokenizer, limit],
            **resource_args,
        )

    @staticmethod
    @timetrace("markBreakToks")
    def mark_break_by_tokens(parent: Document, tokenizer: Tokenizer, limit: int) -> Document:
        """
        Walk the elements in order, keeping a running token total, and mark
        a break wherever the total would exceed ``limit``.

        An element that already carries a truthy '_break' also restarts the
        running total. Elements without text count as zero tokens.

        Args:
            parent: The Document whose elements are counted and marked
            tokenizer: Tokenizer whose ``tokenize`` output length is the count
            limit: Largest running token total allowed before a break

        Returns:
            The same ``parent`` Document, with its elements marked in place.
        """
        running_total = 0
        for element in parent.elements:
            text = element.text_representation
            token_count = len(tokenizer.tokenize(text)) if text else 0
            element.data["_tokCnt"] = token_count

            starts_new_run = element.data.get("_break") or (running_total + token_count) > limit
            if starts_new_run:
                element.data["_break"] = True
                # The breaking element is the first member of the new run.
                running_total = token_count
            else:
                running_total += token_count
        return parent
###############################################################################
class MarkBboxPreset(SingleThreadUser, NonGPUUser, Map):
    """
    MarkBboxPreset runs a fixed sequence of sorting and marking steps over
    each Document. See DocSet.mark_bbox_preset for details.

    Args:
        child: The source Node or component that provides the Elements
        tokenizer: the tokenizer used for the token-based break marking
        token_limit: maximum permitted number of tokens (def 512)
    """

    def __init__(self, child: Node, tokenizer: Tokenizer, token_limit: int = 512, **resource_args):
        super().__init__(
            child,
            f=MarkBboxPreset.mark_bbox_preset,
            args=[tokenizer, token_limit],
            **resource_args,
        )

    @staticmethod
    def mark_bbox_preset(parent: Document, tokenizer: Optional[Tokenizer], token_limit: int = 512) -> Document:
        """
        Apply, in order: sort by page/bbox, mark tiny elements for drop,
        mark headers and footers for drop, mark page breaks, mark column
        breaks, and finally mark token-limit breaks.

        Args:
            parent: The Document to sort and mark in place
            tokenizer: Tokenizer for the token-limit step; when falsy, an
                OpenAITokenizer for "text-embedding-3-small" is created
            token_limit: Limit passed to the token-based break marking

        Returns:
            The same ``parent`` Document after all steps have run.
        """
        # Imported inside the function, as in the original code
        # (presumably to avoid a circular import -- confirm before hoisting).
        from sycamore.transforms.bbox_merge import MarkDropHeaderFooter, SortByPageBbox, MarkBreakByColumn
        from sycamore.functions.tokenizer import OpenAITokenizer

        effective_tokenizer = tokenizer or OpenAITokenizer("text-embedding-3-small")

        # Ordered (step, extra positional args) pairs; each step receives
        # `parent` first and works on it in place. Order matters: the
        # token-based pass reads the '_break' marks set by earlier steps.
        steps = (
            (SortByPageBbox.sort_by_page_bbox, ()),
            (MarkDropTiny.mark_drop_tiny, (2,)),
            # NOTE(review): the two 0.05 values are presumably the header and
            # footer margins -- confirm against MarkDropHeaderFooter.
            (MarkDropHeaderFooter.mark_drop_header_and_footer, (0.05, 0.05)),
            (MarkBreakPage.mark_break_page, ()),
            (MarkBreakByColumn.mark_break_by_column, ()),
            (MarkBreakByTokens.mark_break_by_tokens, (effective_tokenizer, token_limit)),
        )
        for step, extra_args in steps:
            step(parent, *extra_args)
        return parent