Skip to content

tokenizer

Tokenizer

Bases: Hashable, Protocol

Source code in outlines/models/tokenizer.py
class Tokenizer(Hashable, Protocol):
    """Structural interface that tokenizer implementations are expected to satisfy.

    This is a `Protocol`, so any object exposing the attributes and methods
    below conforms without inheriting from this class. Conforming objects
    must also be hashable (`Hashable` is listed as a base).
    """

    # NOTE(review): the attribute meanings below are inferred from their names
    # and declared types only; confirm against a concrete implementation.
    eos_token: str  # presumably the string form of the end-of-sequence token
    eos_token_id: int  # presumably the integer id of the end-of-sequence token
    pad_token_id: int  # presumably the integer id of the padding token
    vocabulary: Dict[str, int]  # presumably maps token strings to token ids
    special_tokens: Set[str]  # presumably the set of special token strings

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> "Tuple['NDArray[np.int64]', 'NDArray[np.int64]']":
        """Translate the input prompts into arrays of token ids and attention mask.

        `prompt` is either a single string or a list of strings. The result is
        a pair of int64 arrays (presumably token ids first, then the attention
        mask, following the order of the summary above -- confirm against an
        implementation).
        """
        ...

    def decode(self, token_ids: "NDArray[np.int64]") -> List[str]:
        """Translate an array of token ids into strings.

        The declared return type is a list of strings.
        """
        ...

    def convert_token_to_string(self, token: str) -> str:
        """Convert a token to its equivalent string.

        This is useful, for instance, for BPE tokenizers where whitespace is
        represented by the special character `Ġ`. This prevents matching a raw
        token that includes `Ġ` with a string.
        """
        ...

convert_token_to_string(token)

Convert a token to its equivalent string.

This is useful, for instance, for BPE tokenizers where whitespace is represented by the special character `Ġ`. This prevents matching a raw token that includes `Ġ` with a string.

Source code in outlines/models/tokenizer.py
def convert_token_to_string(self, token: str) -> str:
    """Convert a token to its equivalent string.

    This is useful, for instance, for BPE tokenizers where whitespace is
    represented by the special character `Ġ`. This prevents matching a raw
    token that includes `Ġ` with a string.
    """
    ...

decode(token_ids)

Translate an array of token ids to a string or list of strings.

Source code in outlines/models/tokenizer.py
def decode(self, token_ids: "NDArray[np.int64]") -> List[str]:
    """Translate an array of token ids into strings.

    The declared return type is a list of strings.
    """
    ...

encode(prompt)

Translate the input prompts into arrays of token ids and attention mask.

Source code in outlines/models/tokenizer.py
def encode(
    self, prompt: Union[str, List[str]]
) -> "Tuple['NDArray[np.int64]', 'NDArray[np.int64]']":
    """Translate the input prompts into arrays of token ids and attention mask.

    `prompt` is either a single string or a list of strings. The result is a
    pair of int64 arrays (presumably token ids first, then the attention mask,
    following the order of the summary above -- confirm against an
    implementation).
    """
    ...