guide

Guides to control generation in steerable models.

Logits processors rely on guides to control the generation process.

CFGGuide

Bases: Guide

Guide to generate text that is in the language of a context-free Lark grammar.

Source code in outlines/processors/guide.py
class CFGGuide(Guide):
    """Guide to generate text that is in the language of a context-free Lark
    grammar.

    """

    def __init__(self, cfg_string: str, tokenizer: "Tokenizer"):
        """
        Parameters
        ----------
        cfg_string
            The context-free grammar to generate text from.
        tokenizer
            The tokenizer to use to convert tokens to ids.

        """
        warnings.warn(
            "Outlines' public *community-contributed* CFG structured generation "
            "is experimental. Please review "
            "https://dottxt-ai.github.io/outlines/latest/reference/generation/cfg#disclaimer"
        )

        self.cfg_string = cfg_string
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.parser = PartialLark(
            cfg_string,
            parser="lalr",
            import_paths=[grammars.GRAMMAR_PATH],
        )
        self.initial_state = CFGState(
            parser_state=self.parser.parse(""), prev_token=None
        )

    def get_next_instruction(self, state: CFGState) -> Instruction:
        """Return the next instruction for guided generation.

        Current lazy approach:
        - For each token in the vocabulary
          - create a copy of the parsers state
          - add the tokens to the parsers input text
          - if valid, add token to returned tokens

        Further refinements are necessary for performant text processing.

        Parameters
        ----------
        state
            The guides current PartialParserState, or None if complete

        Returns
        -------
        Instruction
            A `Generate` instance that contains the model and the allowed token
            ids.

        """
        import torch

        if state.parser_state is None:
            return Write(torch.tensor([self.eos_token_id]))

        valid_tokens = list(
            self.iter_valid_token_ids(
                state, self.tokenizer.vocabulary.values()
            )
        )

        if len(valid_tokens) == 1:
            return Write(torch.tensor(valid_tokens))

        return Generate(torch.tensor(valid_tokens))

    def iter_valid_token_ids(
        self, state: CFGState, candidate_token_ids: ValuesView[int]
    ) -> Generator[int, None, None]:
        """Iterate over the given token_ids and yield those that are valid for
        the current parser state.

        Parameters
        ----------
        parser_state
            The current state of the parser, or None if complete.
        token_ids
            The list of token ids to check for validity.

        Yields
        ------
        int
            Valid token ids.

        """
        for token_id in candidate_token_ids:
            if token_id == self.eos_token_id:
                if self.can_terminate_state(state):
                    yield token_id
            else:
                try:
                    self._get_parser_state_token_applied(state, int(token_id))
                    yield token_id
                except (
                    ValueError,
                    EOFError,
                    UnexpectedToken,
                    UnexpectedCharacters,
                    DedentError,
                ):
                    pass

    def get_next_state(self, state: CFGState, token_id: int) -> CFGState:
        """Update the state of the guide.

        Decode the token_id, and calculate the new parser_state with the token
        applied.

        Parameters
        ----------
        state
            The guides current PartialParserState, or None if complete
        token_id
            The id of the token that was just generated.

        Returns
        -------
        CFGState
            The guides new PartialParserState

        """
        if state.parser_state is None or token_id == self.eos_token_id:
            parser_state = None
        else:
            parser_state = self._get_parser_state_token_applied(state, int(token_id))
        return CFGState(parser_state=parser_state, prev_token=token_id)

    def _get_parser_state_token_applied(
        self, state: CFGState, token_id: int
    ) -> PartialParserState:
        """Apply the given token_id to the parser state.

        Don't mutate `parser_state`, copy to protect

        Get the token string
          - if first token in generation: tokenizer.decode (no leading whitespace)
          - else: normalized (with possibly leading whitespace)

        Don't allow empty ("") tokens, raise ValueError

        Parameters
        ----------
        state
            The guide's current PartialParserState, or None if complete
        token_id
            The id of the token that was just generated.

        Returns
        -------
        PartialParserState
            The parser state with the token applied.

        """
        parser_state = copy.copy(state.parser_state)  # prevent side effects

        # normalize
        if state.prev_token is None:
            new_token_str = self.tokenizer.decode([token_id])[0]
        else:
            prev_token_str = self.tokenizer.decode([[state.prev_token]])[0]
            combined_token_str = self.tokenizer.decode([[state.prev_token, token_id]])[
                0
            ]
            new_token_str = combined_token_str[len(prev_token_str) :]

        if new_token_str == "":
            raise ValueError("empty next token")

        # update parser with new token
        parser_state.lexer.state.text += new_token_str
        self.parser.parse_from_state(parser_state, is_end=False)

        return parser_state

    def is_final_state(self, state: CFGState) -> bool:
        """Return whether the given state is a final state.

        Parameters
        ----------
        state
            The guide's current state.

        Returns
        -------
        bool
            Whether the given state is a final state.

        """
        # TODO: remove this method, use can_terminate_state and
        # must_terminate_state here and in RegexGuide per
        # https://github.com/dottxt-ai/outlines/issues/885
        return self.can_terminate_state(state)

    def can_terminate_state(self, state: CFGState) -> bool:
        """Return whether generation is allowed to terminate.

        Parameters
        ----------
        state
            The guide's current state.

        Returns
        -------
        bool
            Whether generation is allowed to terminate.

        """
        if state.parser_state is not None:
            try:
                copy.copy(state.parser_state).feed_eof()
            except UnexpectedToken:
                return False
        return True

    def must_terminate_state(self, state: CFGState) -> bool:
        """Indicate whether generation must terminate as there are no legal
        continuations.

        Parameters
        ----------
        state
            The guide's current state.

        Returns
        -------
        bool
            Whether generation must terminate.

        """
        return (
            state.parser_state is None or
            set(state.parser_state.accepts()).issubset({"$END"})
        )

    def copy(self) -> "CFGGuide":
        """Create a copy of the Guide.

        Returns
        -------
        CFGGuide
            A copy of the Guide.

        """
        return CFGGuide(self.cfg_string, self.tokenizer)
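
A minimal sketch of driving CFGGuide by hand. The toy grammar, the gpt2 checkpoint, and the TransformerTokenizer import path are assumptions made for illustration; any Outlines tokenizer adapter that exposes vocabulary, eos_token_id and decode works the same way, and CFG generation remains experimental per the disclaimer above.

from transformers import AutoTokenizer

from outlines.models.transformers import TransformerTokenizer  # assumed adapter location
from outlines.processors.guide import CFGGuide

# Toy Lark grammar: the language containing exactly "yes" and "no".
toy_grammar = r"""
start: "yes" | "no"
"""

tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained("gpt2"))
guide = CFGGuide(toy_grammar, tokenizer)

state = guide.initial_state
while not guide.is_final_state(state):
    # Slow by design: the lazy approach re-parses a copy of the state for every
    # vocabulary token to find the ids that keep the text inside the grammar.
    instruction = guide.get_next_instruction(state)
    allowed = instruction.tokens          # Write and Generate both carry `tokens`
    token_id = int(allowed[0])            # a real sampler would choose among `allowed`
    state = guide.get_next_state(state, token_id)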

__init__(cfg_string, tokenizer)

Parameters:

  cfg_string (str, required)
      The context-free grammar to generate text from.
  tokenizer (Tokenizer, required)
      The tokenizer to use to convert tokens to ids.
Source code in outlines/processors/guide.py
def __init__(self, cfg_string: str, tokenizer: "Tokenizer"):
    """
    Parameters
    ----------
    cfg_string
        The context-free grammar to generate text from.
    tokenizer
        The tokenizer to use to convert tokens to ids.

    """
    warnings.warn(
        "Outlines' public *community-contributed* CFG structured generation "
        "is experimental. Please review "
        "https://dottxt-ai.github.io/outlines/latest/reference/generation/cfg#disclaimer"
    )

    self.cfg_string = cfg_string
    self.tokenizer = tokenizer
    self.eos_token_id = self.tokenizer.eos_token_id
    self.parser = PartialLark(
        cfg_string,
        parser="lalr",
        import_paths=[grammars.GRAMMAR_PATH],
    )
    self.initial_state = CFGState(
        parser_state=self.parser.parse(""), prev_token=None
    )

can_terminate_state(state)

Return whether generation is allowed to terminate.

Parameters:

  state (CFGState, required)
      The guide's current state.

Returns:

  bool
      Whether generation is allowed to terminate.

Source code in outlines/processors/guide.py
def can_terminate_state(self, state: CFGState) -> bool:
    """Return whether generation is allowed to terminate.

    Parameters
    ----------
    state
        The guide's current state.

    Returns
    -------
    bool
        Whether generation is allowed to terminate.

    """
    if state.parser_state is not None:
        try:
            copy.copy(state.parser_state).feed_eof()
        except UnexpectedToken:
            return False
    return True

copy()

Create a copy of the Guide.

Returns:

  CFGGuide
      A copy of the Guide.

Source code in outlines/processors/guide.py
def copy(self) -> "CFGGuide":
    """Create a copy of the Guide.

    Returns
    -------
    CFGGuide
        A copy of the Guide.

    """
    return CFGGuide(self.cfg_string, self.tokenizer)

get_next_instruction(state)

Return the next instruction for guided generation.

Current lazy approach:

- For each token in the vocabulary:
    - create a copy of the parser's state
    - add the token to the parser's input text
    - if the result is valid, add the token to the returned tokens

Further refinements are necessary for performant text processing.

Parameters:

  state (CFGState, required)
      The guide's current PartialParserState, or None if complete.

Returns:

  Instruction
      A Generate or Write instance containing the allowed token ids.

Source code in outlines/processors/guide.py
def get_next_instruction(self, state: CFGState) -> Instruction:
    """Return the next instruction for guided generation.

    Current lazy approach:
    - For each token in the vocabulary
      - create a copy of the parsers state
      - add the tokens to the parsers input text
      - if valid, add token to returned tokens

    Further refinements are necessary for performant text processing.

    Parameters
    ----------
    state
        The guides current PartialParserState, or None if complete

    Returns
    -------
    Instruction
        A `Generate` instance that contains the model and the allowed token
        ids.

    """
    import torch

    if state.parser_state is None:
        return Write(torch.tensor([self.eos_token_id]))

    valid_tokens = list(
        self.iter_valid_token_ids(
            state, self.tokenizer.vocabulary.values()
        )
    )

    if len(valid_tokens) == 1:
        return Write(torch.tensor(valid_tokens))

    return Generate(torch.tensor(valid_tokens))
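
Both Write and Generate expose the allowed ids through their tokens attribute. Below is a hedged sketch of how a logits processor could apply such an instruction; mask_logits is a hypothetical helper written for this page, not the processor Outlines actually ships.

import torch

def mask_logits(logits: torch.Tensor, instruction) -> torch.Tensor:
    """Keep only the logits of the token ids the guide allows next."""
    allowed = instruction.tokens          # tensor of ids for CFGGuide instructions
    if allowed is None:                   # Generate(None) means "no restriction"
        return logits
    masked = torch.full_like(logits, float("-inf"))
    masked[allowed] = logits[allowed]
    return masked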

get_next_state(state, token_id)

Update the state of the guide.

Decode the token_id, and calculate the new parser_state with the token applied.

Parameters:

  state (CFGState, required)
      The guide's current PartialParserState, or None if complete.
  token_id (int, required)
      The id of the token that was just generated.

Returns:

  CFGState
      The guide's new PartialParserState.

Source code in outlines/processors/guide.py
def get_next_state(self, state: CFGState, token_id: int) -> CFGState:
    """Update the state of the guide.

    Decode the token_id, and calculate the new parser_state with the token
    applied.

    Parameters
    ----------
    state
        The guides current PartialParserState, or None if complete
    token_id
        The id of the token that was just generated.

    Returns
    -------
    CFGState
        The guides new PartialParserState

    """
    if state.parser_state is None or token_id == self.eos_token_id:
        parser_state = None
    else:
        parser_state = self._get_parser_state_token_applied(state, int(token_id))
    return CFGState(parser_state=parser_state, prev_token=token_id)

is_final_state(state)

Return whether the given state is a final state.

Parameters:

  state (CFGState, required)
      The guide's current state.

Returns:

  bool
      Whether the given state is a final state.

Source code in outlines/processors/guide.py
def is_final_state(self, state: CFGState) -> bool:
    """Return whether the given state is a final state.

    Parameters
    ----------
    state
        The guide's current state.

    Returns
    -------
    bool
        Whether the given state is a final state.

    """
    # TODO: remove this method, use can_terminate_state and
    # must_terminate_state here and in RegexGuide per
    # https://github.com/dottxt-ai/outlines/issues/885
    return self.can_terminate_state(state)

iter_valid_token_ids(state, candidate_token_ids)

Iterate over the given token_ids and yield those that are valid for the current parser state.

Parameters:

  state (CFGState, required)
      The current state of the parser, or None if complete.
  candidate_token_ids (ValuesView[int], required)
      The token ids to check for validity.

Yields:

  int
      Valid token ids.

Source code in outlines/processors/guide.py
def iter_valid_token_ids(
    self, state: CFGState, candidate_token_ids: ValuesView[int]
) -> Generator[int, None, None]:
    """Iterate over the given token_ids and yield those that are valid for
    the current parser state.

    Parameters
    ----------
    parser_state
        The current state of the parser, or None if complete.
    token_ids
        The list of token ids to check for validity.

    Yields
    ------
    int
        Valid token ids.

    """
    for token_id in candidate_token_ids:
        if token_id == self.eos_token_id:
            if self.can_terminate_state(state):
                yield token_id
        else:
            try:
                self._get_parser_state_token_applied(state, int(token_id))
                yield token_id
            except (
                ValueError,
                EOFError,
                UnexpectedToken,
                UnexpectedCharacters,
                DedentError,
            ):
                pass

must_terminate_state(state)

Indicate whether generation must terminate as there are no legal continuations.

Parameters:

  state (CFGState, required)
      The guide's current state.

Returns:

  bool
      Whether generation must terminate.

Source code in outlines/processors/guide.py
def must_terminate_state(self, state: CFGState) -> bool:
    """Indicate whether generation must terminate as there are no legal
    continuations.

    Parameters
    ----------
    state
        The guide's current state.

    Returns
    -------
    bool
        Whether generation must terminate.

    """
    return (
        state.parser_state is None or
        set(state.parser_state.accepts()).issubset({"$END"})
    )

Guide

Bases: Guide

Base definition of a generation guide.

A generation guide defines the behavior of a finite-state machine that guides a text generation procedure. Unlike the DFAs built from regular expression guides, it can also emit a Write instruction, which tells the model that it can append a sequence of tokens (or a whole word) instead of generating it.

Source code in outlines/processors/guide.py
class Guide(CoreGuide):
    """Base definition of a generation guide.

    A generation guide defines the behavior of a finite-state machine that
    guides a text generation procedure. Unlike the DFAs built from regular
    expressions guides, it can also emit a `Write` instructions which tells
    the model that it can append a sequence of tokens (or token word) instead
    of generating it.

    """
    initial_state: Any
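
The interface a subclass must provide is small: an initial_state plus get_next_instruction, get_next_state, is_final_state and copy. Below is a hedged sketch of a custom guide built on that interface; FixedPrefixGuide and its state encoding are hypothetical names invented for this example.

from typing import List

from outlines.processors.guide import Generate, Guide, Instruction, Write


class FixedPrefixGuide(Guide):
    """Force a fixed token prefix, then generate freely until EOS."""

    initial_state = 0
    final_state = -1

    def __init__(self, prefix_ids: List[int], eos_token_id: int):
        self.prefix_ids = prefix_ids
        self.eos_token_id = eos_token_id

    def get_next_instruction(self, state: int) -> Instruction:
        if state == self.final_state:
            return Write([self.eos_token_id])       # nothing left to do but stop
        if state < len(self.prefix_ids):
            return Write([self.prefix_ids[state]])  # append the next prefix token
        return Generate(None)                       # unconstrained afterwards

    def get_next_state(self, state: int, token_id: int) -> int:
        if state == self.final_state or token_id == self.eos_token_id:
            return self.final_state
        if state < len(self.prefix_ids):
            return state + 1
        return state

    def is_final_state(self, state: int) -> bool:
        return state == self.final_state

    def copy(self) -> "FixedPrefixGuide":
        return self  # stateless apart from its configuration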

RegexGuide

Bases: RegexGuide

Guide to generate text in the language of a regular expression.

This class is a wrapper around the CoreRegexGuide class that adds a cache to the create_states_mapping function.

Source code in outlines/processors/guide.py
class RegexGuide(CoreRegexGuide):
    """Guide to generate text in the language of a regular expression.

    This class is a wrapper around the CoreRegexGuide class that adds a cache
    to the create_states_mapping function.

    """

    @classmethod
    def from_regex(
        cls,
        regex_string: str,
        tokenizer,
        **kwargs,
    ):
        """Create a RegexGuide from a regular expression.

        Parameters
        ----------
        regex_string
            The regular expression to generate text from.
        tokenizer
            The tokenizer to use to convert tokens to ids.
        kwargs
            Additional keyword arguments to pass to the CoreRegexGuide constructor.

        Returns
        -------
        RegexGuide
            A RegexGuide instance.

        """
        return super().from_regex(
            regex_string,
            tokenizer,
            _create_states_mapping=cached_create_states_mapping,
            **kwargs,
        )
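
A minimal usage sketch, assuming a Hugging Face tokenizer wrapped in Outlines' TransformerTokenizer adapter (the import path and the gpt2 checkpoint are assumptions for illustration). Because the states mapping is cached, building a second guide for the same regex and tokenizer is much cheaper than the first.

from transformers import AutoTokenizer

from outlines.models.transformers import TransformerTokenizer  # assumed adapter location
from outlines.processors.guide import RegexGuide

tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained("gpt2"))

# ISO-style date, e.g. "2024-05-17".
guide = RegexGuide.from_regex(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", tokenizer)

state = guide.initial_state
instruction = guide.get_next_instruction(state)   # ids allowed at the first position
state = guide.get_next_state(state, int(instruction.tokens[0]))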

from_regex(regex_string, tokenizer, **kwargs) classmethod

Create a RegexGuide from a regular expression.

Parameters:

  regex_string (str, required)
      The regular expression to generate text from.
  tokenizer (required)
      The tokenizer to use to convert tokens to ids.
  kwargs (default: {})
      Additional keyword arguments to pass to the CoreRegexGuide constructor.

Returns:

  RegexGuide
      A RegexGuide instance.

Source code in outlines/processors/guide.py
@classmethod
def from_regex(
    cls,
    regex_string: str,
    tokenizer,
    **kwargs,
):
    """Create a RegexGuide from a regular expression.

    Parameters
    ----------
    regex_string
        The regular expression to generate text from.
    tokenizer
        The tokenizer to use to convert tokens to ids.
    kwargs
        Additional keyword arguments to pass to the CoreRegexGuide constructor.

    Returns
    -------
    RegexGuide
        A RegexGuide instance.

    """
    return super().from_regex(
        regex_string,
        tokenizer,
        _create_states_mapping=cached_create_states_mapping,
        **kwargs,
    )

StopAtEOSGuide

Bases: Guide

Guide to generate tokens until the EOS token has been generated.

Source code in outlines/processors/guide.py
class StopAtEOSGuide(Guide):
    """Guide to generate tokens until the EOS token has been generated."""
    final_state = 1
    initial_state = 0

    def __init__(self, tokenizer: "Tokenizer"):
        """
        Parameters
        ----------
        tokenizer
            The tokenizer used to convert tokens to ids.

        """
        self.eos_token_id = tokenizer.eos_token_id
        self.vocabulary = tokenizer.vocabulary.values()

    def get_next_instruction(self, state: int) -> Instruction:
        """Return the next instruction.

        Parameters
        ----------
        state
            The guide's current state.

        Returns
        -------
        Instruction
            An `Instruction` instance.

        """
        if self.is_final_state(state):
            return Write([self.eos_token_id])
        return Generate(None)

    def get_next_state(self, state: int, token_id: int) -> int:
        """Return the next state.

        Parameters
        ----------
        state
            The guide's current state.
        token_id
            The id of the token that was just generated.

        Returns
        -------
        int
            The next state.

        """
        if token_id == self.eos_token_id or state == self.final_state:
            return self.final_state

        return self.initial_state

    def is_final_state(self, state: int) -> bool:
        """Return whether the given state is a final state.

        Parameters
        ----------
        state
            The guide's current state.

        Returns
        -------
        bool
            Whether the given state is a final state.

        """
        return state == self.final_state

    def copy(self) -> "StopAtEOSGuide":
        """Return itself as there is no need to copy."""
        return self
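
A tiny, self-contained illustration of the two-state machine. The SimpleNamespace stand-in below exists only for the example; the guide reads nothing from the tokenizer beyond eos_token_id and vocabulary, so any Outlines tokenizer adapter can be passed instead.

from types import SimpleNamespace

from outlines.processors.guide import StopAtEOSGuide

# Stand-in tokenizer for illustration; ids and strings are arbitrary.
tokenizer = SimpleNamespace(eos_token_id=2, vocabulary={"a": 0, "b": 1, "</s>": 2})

guide = StopAtEOSGuide(tokenizer)
state = guide.initial_state                       # 0: generation in progress
print(guide.get_next_instruction(state))          # Generate(None): no restriction
state = guide.get_next_state(state, token_id=0)   # any non-EOS token stays in state 0
state = guide.get_next_state(state, token_id=2)   # EOS moves to the final state 1
print(guide.is_final_state(state))                # True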

__init__(tokenizer)

Parameters:

  tokenizer (Tokenizer, required)
      The tokenizer used to convert tokens to ids.
Source code in outlines/processors/guide.py
def __init__(self, tokenizer: "Tokenizer"):
    """
    Parameters
    ----------
    tokenizer
        The tokenizer used to convert tokens to ids.

    """
    self.eos_token_id = tokenizer.eos_token_id
    self.vocabulary = tokenizer.vocabulary.values()

copy()

Return itself as there is no need to copy.

Source code in outlines/processors/guide.py
def copy(self) -> "StopAtEOSGuide":
    """Return itself as there is no need to copy."""
    return self

get_next_instruction(state)

Return the next instruction.

Parameters:

  state (int, required)
      The guide's current state.

Returns:

  Instruction
      An Instruction instance.

Source code in outlines/processors/guide.py
def get_next_instruction(self, state: int) -> Instruction:
    """Return the next instruction.

    Parameters
    ----------
    state
        The guide's current state.

    Returns
    -------
    Instruction
        An `Instruction` instance.

    """
    if self.is_final_state(state):
        return Write([self.eos_token_id])
    return Generate(None)

get_next_state(state, token_id)

Return the next state.

Parameters:

  state (int, required)
      The guide's current state.
  token_id (int, required)
      The id of the token that was just generated.

Returns:

  int
      The next state.

Source code in outlines/processors/guide.py
def get_next_state(self, state: int, token_id: int) -> int:
    """Return the next state.

    Parameters
    ----------
    state
        The guide's current state.
    token_id
        The id of the token that was just generated.

    Returns
    -------
    int
        The next state.

    """
    if token_id == self.eos_token_id or state == self.final_state:
        return self.final_state

    return self.initial_state

is_final_state(state)

Return whether the given state is a final state.

Parameters:

  state (int, required)
      The guide's current state.

Returns:

  bool
      Whether the given state is a final state.

Source code in outlines/processors/guide.py
def is_final_state(self, state: int) -> bool:
    """Return whether the given state is a final state.

    Parameters
    ----------
    state
        The guide's current state.

    Returns
    -------
    bool
        Whether the given state is a final state.

    """
    return state == self.final_state

cached_create_states_mapping(regex_string, tokenizer, *args, **kwargs)

Wrap the uncached create_states_mapping function in a cache.

Source code in outlines/processors/guide.py
@cache()
def cached_create_states_mapping(regex_string, tokenizer, *args, **kwargs):
    """Wrap the uncached create_states_mapping function in a cache."""
    return uncached_create_states_mapping(
        regex_string, tokenizer, *args, **kwargs
    )