Source code for sycamore.transforms.regex_replace

import re


from sycamore.data import Document
from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
from sycamore.transforms.map import Map
from sycamore.utils.time_trace import timetrace

# Substitution pairs shaped like the ``spec`` argument of RegexReplace that
# normalize whitespace. The order matters: the first pair collapses every
# whitespace run (including newlines and tabs) into one space, so the last two
# pairs only ever have a single leading/trailing space left to remove.
COALESCE_WHITESPACE = [
    (r"\s+", " "),  # collapse each run of whitespace into a single space
    (r"^ ", ""),  # drop the single space remaining at the start, if any
    (r" $", ""),  # drop the single space remaining at the end, if any
]



[docs]
class RegexReplace(SingleThreadUser, NonGPUUser, Map):
    """
    The RegexReplace transform modifies the text_representation in each
    Element in every Document.

    Every (pattern, substitution) pair in ``spec`` is compiled once at
    construction time and then applied, in order, to the text of each
    Element. Elements whose text_representation is None are left untouched.
    For every other Element the binary_representation is overwritten with
    the UTF-8 encoding of the rewritten text so the two stay in sync.

    Args:
        child: The source node or component that provides the documents
        spec: A list of tuples of regular expressions and substitutions,
              to be executed in order via re.sub()
        kwargs: Additional resource-related arguments that can be passed to the operation

    Raises:
        TypeError: If spec cannot be read as a sequence of (str, str) pairs.
        re.error: If one of the regular expressions fails to compile.

    Example:
        .. code-block:: python

            rr = RegexReplace(child=node, spec=[(r"\\s+", " "), (r"^ ", "")])
            dataset = rr.execute()
    """

    def __init__(self, child: Node, spec: list[tuple[str, str]], **kwargs):
        mismatch = "RegexReplace spec is not list[tuple[str, str]]"

        # Materialize spec exactly once. Validating and compiling in two
        # separate passes would exhaust a one-shot iterable (generator, zip)
        # during validation and then silently compile zero patterns.
        try:
            pairs = [(exp, repl) for exp, repl in spec]
        except (TypeError, ValueError) as err:
            # TypeError: spec or an entry is not iterable.
            # ValueError: an entry does not unpack into exactly two items.
            raise TypeError(mismatch) from err

        for exp, repl in pairs:
            if not (isinstance(exp, str) and isinstance(repl, str)):
                raise TypeError(mismatch)

        # Compile up front so invalid patterns fail at construction time and
        # the per-document function does no repeated pattern lookups.
        compiled = [(re.compile(exp), repl) for exp, repl in pairs]

        @timetrace("regexRepl")
        def regex_replace(doc: Document) -> Document:
            for elem in doc.elements:
                txt = elem.text_representation
                if txt is None:
                    continue  # nothing to rewrite for this element
                for rex, repl in compiled:
                    txt = rex.sub(repl, txt)
                elem.text_representation = txt
                # Keep the binary form consistent with the rewritten text.
                elem.binary_representation = txt.encode("utf-8")
            return doc

        super().__init__(child, f=regex_replace, **kwargs)