Source code for sycamore.transforms.extract_table_properties

import json
from typing import Optional, Union

from sycamore.data import Document
from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
from sycamore.transforms.map import Map
from sycamore.utils.time_trace import timetrace
from sycamore.llms import LLM
from sycamore.llms.prompts.default_prompts import ExtractTablePropertiesPrompt
from PIL import Image
from sycamore.functions.document import split_and_convert_to_image



[docs]
class ExtractTableProperties(SingleThreadUser, NonGPUUser, Map):
    """
    Map transform that extracts key-value pairs from tables and adds them as properties to the
    table element. It only processes tables that are one level deep.

    Args:
        child: The source node or component that provides the hierarchical documents whose tables
            are processed.
        parameters: Values handed to ``Map`` as ``args`` for ``extract_table_properties``. Judging by
            that method's signature they are the property name followed by the LLM (and, optionally,
            the prompt overrides).
        resource_args: Additional resource-related arguments that can be passed to the extract operation.

    Example:
        .. code-block:: python

            source_node = ...  # Define a source node or component that provides hierarchical documents.
            llm = openAI('gpt-4o-mini')
            property_extract = ExtractTableProperties(child=source_node, parameters=["property_name", llm])
            property_dataset = property_extract.execute()
    """

    def __init__(self, child: Node, parameters: list[Union[str, LLM]], **resource_args):
        # The per-document work lives in the static method below; this class only wires it into Map.
        extractor = ExtractTableProperties.extract_table_properties
        super().__init__(child, f=extractor, args=parameters, **resource_args)


[docs]
    @staticmethod
    def extract_parent_json(input_string: str) -> str:
        """
        Extracts the top level JSONstring from input String.
        """
        stack: list[str] = []
        json_start = None
        json_str = ""

        for i, char in enumerate(input_string):
            if char == "{":
                if not stack:
                    json_start = i
                stack.append(char)
            elif char == "}":
                stack.pop()
                if not stack:
                    json_end = i + 1
                    json_str = input_string[json_start:json_end]
        return json_str



[docs]
    @staticmethod
    @timetrace("ExtrKeyVal")
    def extract_table_properties(
        parent: Document,
        property_name: str,
        llm: LLM,
        prompt_find_table: Optional[str] = None,
        prompt_LLM: Optional[str] = None,
    ) -> Document:
        """
        Ask the LLM for the key/value pairs of every table element in ``parent`` and store the
        parsed answer as a property of that element.

        Args:
            parent: Document whose elements are scanned for tables; its elements are updated in place.
            property_name: Key under which the parsed JSON is written into the element's properties.
            llm: LLM used to format the cropped table image and to generate the answer.
            prompt_find_table: Accepted but not used by this method.
            prompt_LLM: When not None, used as the prompt text instead of the default prompt
                followed by the element's text representation.

        Returns:
            The same ``parent`` document.

        Raises:
            json.JSONDecodeError: If the JSON substring taken from the LLM answer is not valid JSON
                (``json.loads`` is not guarded here).
        """
        # Rebuild a PIL image for each document returned by split_and_convert_to_image,
        # remembering the size and mode it was built with.
        pages = []
        for page_doc in split_and_convert_to_image(parent):
            page_size = tuple(page_doc.properties["size"])
            page_mode = page_doc.properties["mode"]
            page_image = Image.frombytes(mode=page_mode, size=page_size, data=page_doc.binary_representation)
            pages.append((page_image, page_size, page_mode))

        for element in parent.elements:
            # Only table elements that carry a bounding box are sent to the LLM.
            if element is None or element.type != "table" or element.bbox is None:
                continue

            # output of APS is one indexed, the page list is zero indexed
            page_image, page_size, _ = pages[element.properties["page_number"] - 1]
            coords = element.bbox.coordinates
            # Bbox coordinates are scaled by the page size (presumably fractions of the page) to get pixels.
            crop_box = (
                coords[0] * page_size[0],
                coords[1] * page_size[1],
                coords[2] * page_size[0],
                coords[3] * page_size[1],
            )
            table_image = page_image.crop(crop_box)

            if prompt_LLM is not None:
                prompt_text = prompt_LLM
            else:
                prompt_text = (
                    ExtractTablePropertiesPrompt.user + f"\n CSV: {element.text_representation}"  # type: ignore
                )  # type ignore - thinks ETPP.user could be None

            user_message = {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    llm.format_image(table_image),
                ],
            }
            raw_answer = llm.generate_old(prompt_kwargs={"messages": [user_message]}, llm_kwargs={})

            # The answer may wrap the JSON in extra text; keep only the JSON object, if there is one.
            parsed_json = ExtractTableProperties.extract_parent_json(raw_answer)
            if parsed_json:
                element.properties[property_name] = json.loads(parsed_json)
        return parent