Source code for sycamore.transforms.extract_table_properties

import json
from typing import Optional, Union

from sycamore.data import Document
from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
from sycamore.transforms.map import Map
from sycamore.utils.time_trace import timetrace
from sycamore.llms import LLM
from sycamore.llms.prompts import ExtractTablePropertiesPrompt
from PIL import Image
from sycamore.functions.document import split_and_convert_to_image


class ExtractTableProperties(SingleThreadUser, NonGPUUser, Map):
    """
    The ExtractTableProperties transform extracts key-value pairs from tables and adds them as
    properties to the table. It only processes tables that are one level deep, i.e. table
    elements found directly in ``document.elements``.

    Args:
        child: The source node or component that provides the hierarchical documents for
            extracting table property.
        parameters: Arguments forwarded (via ``Map``'s ``args``) to
            :meth:`extract_table_properties` after the document: the property name under which
            the result is stored, followed by the LLM to query.
        resource_args: Additional resource-related arguments that can be passed to the extract
            operation.

    Example:
        .. code-block:: python

            source_node = ...  # Define a source node or component that provides hierarchical documents.
            llm = ...  # An LLM instance capable of handling image input.
            property_extract = ExtractTableProperties(child=source_node, parameters=["property_name", llm])
            property_dataset = property_extract.execute()
    """

    def __init__(self, child: Node, parameters: list[Union[str, LLM]], **resource_args):
        super().__init__(child, f=ExtractTableProperties.extract_table_properties, args=parameters, **resource_args)

    @staticmethod
    def extract_parent_json(input_string: str) -> str:
        """
        Extracts the top level JSON string from the input string.

        The input is scanned for brace-balanced ``{...}`` spans. Braces that appear inside a
        double-quoted string within an object are not counted, and a closing brace with no
        matching opening brace is ignored instead of raising.

        Args:
            input_string: Free-form text (typically a raw LLM answer) that may contain a JSON object.

        Returns:
            The text of the last complete top-level ``{...}`` span, or an empty string when the
            input contains no complete span. The result is not validated as JSON.
        """
        depth = 0
        json_start = 0
        json_str = ""
        in_string = False
        escaped = False

        for i, char in enumerate(input_string):
            if in_string:
                # Inside a JSON string literal: only look for its terminating quote.
                if escaped:
                    escaped = False
                elif char == "\\":
                    escaped = True
                elif char == '"':
                    in_string = False
                continue

            if char == '"':
                # Quotes only delimit JSON strings inside an object; in the surrounding prose
                # they are ordinary characters and must not hide a following "{".
                in_string = depth > 0
            elif char == "{":
                if depth == 0:
                    json_start = i
                depth += 1
            elif char == "}" and depth > 0:
                depth -= 1
                if depth == 0:
                    # Later top-level objects overwrite earlier ones, so the last one wins.
                    json_str = input_string[json_start : i + 1]

        return json_str

    @staticmethod
    @timetrace("ExtrKeyVal")
    def extract_table_properties(
        parent: Document,
        property_name: str,
        llm: LLM,
        prompt_find_table: Optional[str] = None,
        prompt_LLM: Optional[str] = None,
    ) -> Document:
        """
        This method is used to extract key/value pairs from tables, using the LLM,
        and populate them as a property of that element.

        Every element of ``parent`` whose type is ``"table"`` and that has a bounding box is
        cropped out of its page image and sent to the LLM together with a text prompt. If the
        answer contains a top-level JSON object, it is parsed and stored in
        ``element.properties[property_name]``.

        Args:
            parent: The document whose table elements are processed. It is modified in place.
            property_name: Key under which the parsed JSON is stored on each table element.
            llm: The LLM used to read the table; it must support ``format_image``.
            prompt_find_table: Currently unused; kept for interface compatibility.
            prompt_LLM: Optional replacement for the default prompt text. When given it is sent
                as-is; the table's text representation is only appended to the default prompt.

        Returns:
            The same ``parent`` document.

        Raises:
            json.JSONDecodeError: If the brace-balanced span found in the LLM answer is not valid JSON.
        """
        tables = [ele for ele in parent.elements if ele is not None and ele.type == "table" and ele.bbox is not None]
        if not tables:
            # Nothing to extract, so skip rendering the document pages altogether.
            return parent

        pages = []
        for page_doc in split_and_convert_to_image(parent):
            size = tuple(page_doc.properties["size"])
            mode = page_doc.properties["mode"]
            page_image = Image.frombytes(mode=mode, size=size, data=page_doc.binary_representation)
            pages.append((page_image, size))

        for ele in tables:
            page_image, size = pages[ele.properties["page_number"] - 1]  # output of APS is one indexed
            bbox = ele.bbox.coordinates
            # The bounding box is scaled by the page size to get pixel coordinates
            # (presumably it is stored as fractions of the page width/height).
            table_image = page_image.crop(
                (bbox[0] * size[0], bbox[1] * size[1], bbox[2] * size[0], bbox[3] * size[1])
            )

            if prompt_LLM is not None:
                prompt_text = prompt_LLM
            else:
                # type ignore - thinks ETPP.user could be None
                prompt_text = ExtractTablePropertiesPrompt.user + f"\n CSV: {ele.text_representation}"  # type: ignore

            content = [
                {"type": "text", "text": prompt_text},
                llm.format_image(table_image),
            ]
            messages = [
                {"role": "user", "content": content},
            ]
            prompt_kwargs = {"messages": messages}

            raw_answer = llm.generate_old(prompt_kwargs=prompt_kwargs, llm_kwargs={})
            parsed_json = ExtractTableProperties.extract_parent_json(raw_answer)
            if parsed_json:
                ele.properties[property_name] = json.loads(parsed_json)

        return parent