Source code for sycamore.transforms.standardizer

from abc import ABC, abstractmethod
from datetime import datetime
import re
from typing import Any, List, Optional

import dateparser
from sycamore.plan_nodes import Node
from sycamore.data import Document
from sycamore.transforms.map import Map

import logging

logger = logging.getLogger(__name__)


class Standardizer(ABC):
    """
    Interface for standardizers: objects that rewrite one field of a document
    into a normalized form.

    Subclasses supply two pieces: ``fixer``, which normalizes a single raw
    value, and ``standardize``, which locates a field inside a document and
    applies ``fixer`` to it.
    """

    @abstractmethod
    def fixer(self, text: str) -> Any:
        """
        Normalize a single raw value.

        Args:
            text (str): The raw text (for example a location or a date string)
                to be standardized.

        Returns:
            The standardized value; its type is chosen by the subclass.
        """

    @abstractmethod
    def standardize(self, doc: Document, key_path: List[str]) -> Document:
        """
        Apply ``fixer`` to the field of ``doc`` addressed by ``key_path``.

        Args:
            doc (Document): The document to be standardized.
            key_path (List[str]): Keys leading from the document root to the
                field that should be standardized.

        Returns:
            Document: The document with the standardized field.

        Raises:
            KeyError: If any of the keys in key_path are not found in the
                document.
        """
class USStateStandardizer(Standardizer):
    """
    A standardizer that expands US state abbreviations into full state names.

    Every standalone, upper-case, two-letter token that is a key of
    ``state_abbreviations`` is replaced by the corresponding full name. The
    match is purely lexical, so upper-case words that happen to coincide with
    an abbreviation (for example "IN", "OR" or "OK") are expanded as well.

    Example:
        .. code-block:: python

            source_docset = ...  # Define a source node or component that provides hierarchical documents.
            transformed_docset = source_docset.map(
                lambda doc: USStateStandardizer.standardize(
                    doc,
                    key_path=["path", "to", "location"]))
    """

    # Two-letter abbreviation -> full name, for the 50 states plus DC.
    state_abbreviations = {
        "AK": "Alaska",
        "AL": "Alabama",
        "AR": "Arkansas",
        "AZ": "Arizona",
        "CA": "California",
        "CO": "Colorado",
        "CT": "Connecticut",
        "DC": "District of Columbia",
        "DE": "Delaware",
        "FL": "Florida",
        "GA": "Georgia",
        "HI": "Hawaii",
        "IA": "Iowa",
        "ID": "Idaho",
        "IL": "Illinois",
        "IN": "Indiana",
        "KS": "Kansas",
        "KY": "Kentucky",
        "LA": "Louisiana",
        "MA": "Massachusetts",
        "MD": "Maryland",
        "ME": "Maine",
        "MI": "Michigan",
        "MN": "Minnesota",
        "MO": "Missouri",
        "MS": "Mississippi",
        "MT": "Montana",
        "NC": "North Carolina",
        "ND": "North Dakota",
        "NE": "Nebraska",
        "NH": "New Hampshire",
        "NJ": "New Jersey",
        "NM": "New Mexico",
        "NV": "Nevada",
        "NY": "New York",
        "OH": "Ohio",
        "OK": "Oklahoma",
        "OR": "Oregon",
        "PA": "Pennsylvania",
        "RI": "Rhode Island",
        "SC": "South Carolina",
        "SD": "South Dakota",
        "TN": "Tennessee",
        "TX": "Texas",
        "UT": "Utah",
        "VA": "Virginia",
        "VT": "Vermont",
        "WA": "Washington",
        "WI": "Wisconsin",
        "WV": "West Virginia",
        "WY": "Wyoming",
    }

    @staticmethod
    def fixer(text: str) -> str:
        """
        Replaces any US state abbreviations in the text with their full state names.

        Args:
            text (str): The text containing US state abbreviations.

        Returns:
            str: The text with state abbreviations replaced by full state
            names. Two-letter upper-case tokens that are not known
            abbreviations are left untouched.
        """
        abbreviations = USStateStandardizer.state_abbreviations
        # Unknown two-letter tokens fall back to themselves, i.e. stay as-is.
        return re.sub(
            r"\b[A-Z]{2}\b",
            lambda match: abbreviations.get(match.group(0), match.group(0)),
            text,
        )

    @staticmethod
    def standardize(doc: Document, key_path: List[str]) -> Document:
        """
        Applies the fixer method to a specific field in the document as defined by the key_path.

        The field is updated in place and the same document is returned.

        Args:
            doc (Document): The document to be standardized.
            key_path (List[str]): The path to the field within the document that should be standardized.

        Returns:
            Document: The document with the standardized field.

        Raises:
            KeyError: If any of the keys in key_path are not found in the
                document, if an intermediate value is empty, or if the target
                field is None.
            IndexError: If key_path is empty.
        """
        current = doc
        for key in key_path[:-1]:
            # A missing or empty intermediate value cannot be descended into,
            # so both cases are reported as a missing key.
            if not current.get(key, None):
                raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}")
            current = current[key]

        target_key = key_path[-1]
        # Test for presence rather than truthiness: a field that exists but
        # holds an empty string is not "missing" and is simply passed through.
        if target_key not in current.keys():
            raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}")
        if current[target_key] is None:
            raise KeyError(f"Key {target_key} has value None")

        current[target_key] = USStateStandardizer.fixer(current[target_key])
        return doc
class DateTimeStandardizer(Standardizer):
    """
    A standardizer for transforming date and time strings into a consistent format.

    Example:
        .. code-block:: python

            source_docset = ...  # Define a source node or component that provides hierarchical documents.
            transformed_docset = source_docset.map(
                lambda doc: DateTimeStandardizer.standardize(
                    doc,
                    key_path=["path", "to", "datetime"]))
    """

    # strftime format used by standardize() when no date_format is supplied.
    DEFAULT_FORMAT = "%B %d, %Y %H:%M:%S%Z"

    # Regexes for military time stuff below. Example matching strings:
    #   clock:     8:00 12:30 23:59:59
    #   year:      1970-04-30 1999-12 12/5/2024 12/2000 4/30/70
    #   digitpair: 0800 235959
    clock_re = re.compile(r"\d:[0-5]\d")
    year_re = re.compile(r"([12]\d\d\d-)|(/[12]\d\d\d)|(\d/[0-3]?\d/\d)")
    digitpair_re = re.compile(r"([0-2]\d)([0-5]\d)(\d\d)?")

    @staticmethod
    def fixer(raw_dateTime: str) -> datetime:
        """
        Standardize a date-time string by parsing it into a datetime object.

        Before parsing, a digits-only military clock time is rewritten with
        colons (see ``fix_military``), the words "Local"/"local" are removed,
        and every "." is replaced by ":".

        Args:
            raw_dateTime (str): The raw date-time string to be standardized.

        Returns:
            datetime: The datetime parsed from the cleaned-up string.

        Raises:
            AssertionError: If raw_dateTime is None.
            ValueError: If the input string cannot be parsed into a valid date-time.
            RuntimeError: For any other unexpected errors during the processing.
        """
        assert raw_dateTime is not None, "raw_dateTime is None"
        try:
            raw_dateTime = DateTimeStandardizer.fix_military(raw_dateTime)
            raw_dateTime = raw_dateTime.replace("Local", "")
            raw_dateTime = raw_dateTime.replace("local", "")
            # Presumably so that dotted clock times such as "8.30" read as "8:30".
            raw_dateTime = raw_dateTime.replace(".", ":")
            parsed = dateparser.parse(raw_dateTime)
        except ValueError as e:
            # Handle errors related to value parsing
            raise ValueError(f"Invalid date format: {raw_dateTime}") from e
        except Exception as e:
            # Handle any other exceptions
            raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e

        # A falsy parse result means the string was not recognized as a date.
        if not parsed:
            raise ValueError(f"Invalid date format: {raw_dateTime}")
        return parsed

    @staticmethod
    def fix_military(raw: str) -> str:
        """
        Insert colons into a digits-only military clock time (e.g. "0800").

        The rewrite happens only when the whitespace-separated tokens contain
        no colon-style clock time, at least one year-like date token, and
        exactly one 4- or 6-digit token. In every other case the stripped
        input is returned unchanged.

        Args:
            raw (str): The raw date-time string.

        Returns:
            str: The stripped input, with the single military time rewritten as
            "HH:MM" or "HH:MM:SS" when the conditions above hold. When a
            rewrite happens, tokens are re-joined with single spaces.
        """
        raw = raw.strip()
        tokens = raw.split()

        saw_clock = 0
        saw_year = 0
        saw_digits = 0
        for token in tokens:
            if DateTimeStandardizer.clock_re.search(token):
                saw_clock += 1
            elif DateTimeStandardizer.year_re.search(token):
                saw_year += 1
            elif DateTimeStandardizer.digitpair_re.fullmatch(token):
                saw_digits += 1

        # If unsure there's exactly one military clock time, bail out.
        # Note that numbers like 2024 could be times or years.
        if (saw_clock > 0) or (saw_year == 0) or (saw_digits != 1):
            return raw

        pieces: list[str] = []
        for token in tokens:
            if match := DateTimeStandardizer.digitpair_re.fullmatch(token):
                # fullmatch covers the whole token, so the token is replaced
                # outright; the optional seconds group is dropped when absent.
                token = ":".join(group for group in match.groups() if group)
            pieces.append(token)
        return " ".join(pieces)

    @staticmethod
    def standardize(
        doc: Document,
        key_path: List[str],
        add_day: bool = True,
        add_dateTime: bool = True,
        date_format: Optional[str] = None,
    ) -> Document:
        """
        Applies the fixer method to a specific date-time field in the document as defined by the key_path.

        The field is overwritten in place with the rendered date-time string.
        The optional "dateTime" and "day" entries are written next to the
        target field, i.e. into the container reached by key_path[:-1].

        Args:
            doc (Document): The document to be standardized.
            key_path (List[str]): The path to the date-time field within the document that should be standardized.
            add_day (bool): Whether to add a "day" field holding the date part of the parsed
                date-time. Will not overwrite an existing "day" field.
            add_dateTime (bool): Whether to add a "dateTime" field holding the parsed datetime
                object. Will not overwrite an existing "dateTime" field.
            date_format (Optional[str]): strftime-compatible format string to render the datetime.
                Defaults to ``DEFAULT_FORMAT``.

        Returns:
            Document: The document with the standardized date-time field and, when requested,
            the additional "dateTime" and "day" fields.

        Raises:
            KeyError: If any of the keys in key_path are not found in the document, or if the
                target field is None.
            IndexError: If key_path is empty.
            ValueError: If the field cannot be parsed as a date-time (raised by ``fixer``).
            RuntimeError: For unexpected errors while parsing (raised by ``fixer``).
        """
        current = doc
        for key in key_path[:-1]:
            if key in current.keys():
                current = current[key]
            else:
                raise KeyError(f"Key {key} not found in the dictionary among {current.keys()}")

        target_key = key_path[-1]
        if target_key not in current.keys():
            raise KeyError(f"Key {target_key} not found in the dictionary among {current.keys()}")
        if current[target_key] is None:
            raise KeyError(f"Key {target_key} has value None")

        parsed = DateTimeStandardizer.fixer(current[target_key])
        rendered = parsed.strftime(date_format or DateTimeStandardizer.DEFAULT_FORMAT)
        current[target_key] = rendered

        if add_dateTime and "dateTime" not in current.keys():
            current["dateTime"] = parsed
        if add_day and "day" not in current.keys():
            current["day"] = parsed.date()
        return doc
class StandardizeProperty(Map):
    """
    Map transform that applies a standardizer to one property of each document.

    The standardizer's ``standardize`` method becomes the mapped function, so
    the same normalization logic (for example for location or date-time
    properties) is run across every document of the child node.
    """

    def __init__(
        self,
        child: Node,
        standardizer: Standardizer,
        path: list[str],
        **kwargs,
    ):
        # NOTE(review): ``path`` is handed to Map as ``args`` unchanged and the
        # extra keyword arguments as ``kwargs``; how Map forwards them to the
        # mapped function is not visible here -- confirm against Map.
        standardize_fn = standardizer.standardize
        super().__init__(child, f=standardize_fn, args=path, kwargs=kwargs)


def ignore_errors(doc: Document, standardizer: Standardizer, key_path: list[str]) -> Document:
    """
    Run a standardizer on a document, logging failures instead of raising them.

    A KeyError from the standardizer is logged as a warning naming the key
    path and the document; any other exception is logged as an error. In both
    cases the document that was passed in is returned.

    Example:
        .. code-block:: python

            docset.map(
                lambda doc: ignore_errors(
                    doc, DateTimeStandardizer, ["properties", "entity", "dateAndTime"]))

    Args:
        doc (Document): The document to be standardized.
        standardizer (Standardizer): The standardizer whose ``standardize``
            method is applied.
        key_path (list[str]): The path to the field that should be standardized.

    Returns:
        Document: The result of ``standardizer.standardize`` on success,
        otherwise the document that was passed in.
    """
    try:
        return standardizer.standardize(doc, key_path=key_path)
    except KeyError:
        logger.warning(f"Key {key_path} not found in document: {doc}")
    except Exception as exc:
        logger.error(exc)
    return doc