Source code for sycamore.transforms.random_sample
import random
from typing import Optional, TYPE_CHECKING
from sycamore.plan_nodes import Node, Transform
from sycamore.data import Document
if TYPE_CHECKING:
from ray.data import Dataset
[docs]
class RandomSample(Transform):
    """
    Generates a random sample of documents in a collection.

    Args:
        child: The plan node providing the dataset.
        fraction: The fraction of documents to retain.
        seed: The seed to use to initialize the RNG.
        resource_args: Additional resource-related arguments to pass to the execution env.
    """

    def __init__(self, child: Node, fraction: float, seed: Optional[int] = None, **resource_args):
        super().__init__(child, **resource_args)
        self.fraction = fraction
        self.seed = seed

    def execute(self, **kwargs) -> "Dataset":
        """Execute the child node and sample the resulting dataset.

        Sampling is delegated to the dataset's own ``random_sample`` method,
        which receives the configured fraction and seed.

        Args:
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The sampled dataset.
        """
        dataset = self.child().execute()
        return dataset.random_sample(self.fraction, seed=self.seed)

    def local_execute(self, all_docs: list[Document]) -> list[Document]:
        """Return a random sample of ``all_docs`` without replacement.

        The sample holds exactly ``int(len(all_docs) * self.fraction)``
        documents (the product is truncated toward zero).

        Args:
            all_docs: The documents to sample from.

        Returns:
            A new list containing the sampled documents.

        Raises:
            ValueError: Raised by ``random.sample`` if the computed sample
                size is negative or larger than ``len(all_docs)``.
        """
        sample_size = int(len(all_docs) * self.fraction)
        if self.seed is None:
            # No seed requested: draw from the shared module-level generator.
            return random.sample(all_docs, sample_size)
        # Use a private generator so a seeded sample is reproducible without
        # reseeding the process-wide RNG that unrelated code may depend on.
        # Random(seed) yields the same sequence as random.seed(seed) would.
        return random.Random(self.seed).sample(all_docs, sample_size)