Source code for sycamore.functions.tokenizer
from abc import ABC, abstractmethod
from functools import cache
from typing import Union, Optional
class Tokenizer(ABC):
    """Abstract base class for tokenizers that emit string tokens or integer token ids."""

    def __init__(self, max_tokens: Optional[int] = None):
        # TODO: Make max_tokens non-optional
        self.max_tokens = max_tokens

    # Note: @cache on the abstract method is inert once a subclass overrides
    # tokenize; each override installs its own @cache, keyed on
    # (self, text, as_ints) and retained for the life of the process.
    @cache
    @abstractmethod
    def tokenize(self, text: str, as_ints: bool = False) -> Union[list[int], list[str]]:
        pass
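# Illustrative sketch (not part of this module): the minimum a subclass must
# provide. WhitespaceTokenizer and its ad-hoc vocabulary are hypothetical;
# real tokenizers ship a fixed vocabulary.
#
#   class WhitespaceTokenizer(Tokenizer):
#       def __init__(self, max_tokens: Optional[int] = None):
#           super().__init__(max_tokens)
#           self._vocab: dict[str, int] = {}
#
#       @cache
#       def tokenize(self, text: str, as_ints: bool = False):
#           tokens = text.split()
#           if as_ints:
#               return [self._vocab.setdefault(t, len(self._vocab)) for t in tokens]
#           return tokens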
class OpenAITokenizer(Tokenizer):
    """Tokenizer backed by tiktoken, using the encoding registered for an OpenAI model."""

    def __init__(self, model_name: str, max_tokens: Optional[int] = None):
        import tiktoken

        self._tk = tiktoken.encoding_for_model(model_name)
        super().__init__(max_tokens)

    @cache
    def tokenize(self, text: str, as_ints: bool = False):
        token_ids = self._tk.encode(text)
        if as_ints:
            return token_ids
        # Decode each id on its own so the result is one string per token.
        tokens = self._tk.decode_batch([[token_id] for token_id in token_ids])
        return tokens
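# Usage sketch (illustrative, not part of the module). Assumes tiktoken
# recognizes the model name; "gpt-3.5-turbo" maps to the cl100k_base
# encoding, under which the outputs below are the expected ones.
#
#   >>> tk = OpenAITokenizer("gpt-3.5-turbo")
#   >>> tk.tokenize("hello world", as_ints=True)
#   [15339, 1917]
#   >>> tk.tokenize("hello world")
#   ['hello', ' world']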
class CharacterTokenizer(Tokenizer):
    """Tokenizer that treats each character as a token, with Unicode code points as ids."""

    @cache
    def tokenize(self, text: str, as_ints: bool = False):
        if as_ints:
            return [ord(c) for c in text]
        return list(text)
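# Usage sketch (illustrative, not part of the module):
#
#   >>> tk = CharacterTokenizer()
#   >>> tk.tokenize("abc")
#   ['a', 'b', 'c']
#   >>> tk.tokenize("abc", as_ints=True)
#   [97, 98, 99]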
class HuggingFaceTokenizer(Tokenizer):
    """Tokenizer that wraps a Hugging Face AutoTokenizer loaded from a checkpoint."""

    def __init__(self, model_name: str):
        from transformers import AutoTokenizer

        self._tk = AutoTokenizer.from_pretrained(model_name)
        super().__init__(max_tokens=self._tk.model_max_length)

    @cache
    def tokenize(self, text: str, as_ints: bool = False):
        if as_ints:
            # encode() adds the model's special tokens by default, so the id
            # list can be longer than the string-token list below.
            return self._tk.encode(text)
        return self._tk.tokenize(text)
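# Usage sketch (illustrative, not part of the module). Assumes the
# "bert-base-uncased" checkpoint is available from the Hugging Face Hub;
# the id form includes BERT's [CLS] (101) and [SEP] (102) special tokens.
#
#   >>> tk = HuggingFaceTokenizer("bert-base-uncased")
#   >>> tk.tokenize("hello world")
#   ['hello', 'world']
#   >>> tk.tokenize("hello world", as_ints=True)
#   [101, 7592, 2088, 102]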