# Source code for sycamore.functions.chunker
from abc import abstractmethod
from typing import Any
class Chunker:
    """
    Base interface for objects that split a sequence of tokens into chunks.

    Note: ``chunk`` is marked with ``abstractmethod``, but this class does not
    use ``abc.ABCMeta`` as its metaclass, so the marker is not enforced when
    the class is instantiated. Subclasses are expected to override ``chunk``.
    """

    @abstractmethod
    def chunk(self, tokens: list[Any]) -> list[Any]:
        """
        Split ``tokens`` into chunks.

        The base implementation does nothing and implicitly returns ``None``;
        subclasses provide the actual chunking strategy.
        """
# [docs]
class TextOverlapChunker(Chunker):
    """
    TextOverlapChunker is a class for chunking text into smaller segments while allowing for token overlap.

    This class inherits from the Chunker class and is designed to divide a long sequence of tokens into chunks,
    each containing at most a specified number of tokens. Consecutive chunks share a configurable number of
    tokens, so context at a chunk boundary is present in both neighbours.

    Args:
        chunk_token_count: The maximum number of tokens to include in each chunk. Must be positive.
        chunk_overlap_token_count: The number of tokens shared between adjacent chunks. Must be
            non-negative and strictly less than ``chunk_token_count`` so that every chunk advances.

    Raises:
        ValueError: If ``chunk_overlap_token_count`` is not less than ``chunk_token_count``, if
            ``chunk_token_count`` is not positive, or if ``chunk_overlap_token_count`` is negative.

    Example:
        .. code-block:: python

            chunker = TextOverlapChunker(chunk_token_count=1000, chunk_overlap_token_count=100)
            chunks = chunker.chunk(data)
    """

    def __init__(self, chunk_token_count: int = 1000, chunk_overlap_token_count: int = 100) -> None:
        super().__init__()
        # ValueError is a subclass of Exception, so callers that caught the
        # previously raised bare Exception keep working.
        if chunk_overlap_token_count >= chunk_token_count:
            raise ValueError("Token overlap count between chunks must be lesser than chunk token count")
        # A non-positive window size can only produce empty or nonsensical slices.
        if chunk_token_count <= 0:
            raise ValueError(f"Chunk token count must be positive, got {chunk_token_count}")
        # A negative overlap makes the stride larger than the window, which
        # would silently drop the tokens that fall between two windows.
        if chunk_overlap_token_count < 0:
            raise ValueError(f"Token overlap count must be non-negative, got {chunk_overlap_token_count}")
        self._chunk_token_count = chunk_token_count
        self._chunk_overlap_token_count = chunk_overlap_token_count

    def chunk(self, tokens: list[Any]) -> list[Any]:
        """
        Split ``tokens`` into overlapping windows.

        Windows start every ``chunk_token_count - chunk_overlap_token_count`` tokens and each holds up to
        ``chunk_token_count`` tokens. The final windows may be shorter than the maximum, and when the
        overlap is non-zero the last window may consist solely of tokens that already appear at the end
        of the window before it.

        Args:
            tokens: The sequence of tokens to split.

        Returns:
            A list of slices of ``tokens`` in order of their start position; empty when ``tokens`` is empty.
        """
        window = self._chunk_token_count
        # The constructor guarantees 0 <= overlap < window, so the stride is always >= 1.
        stride = window - self._chunk_overlap_token_count
        return [tokens[start : start + window] for start in range(0, len(tokens), stride)]