Skip to content

outlines

Outlines is a Generative Model Programming Framework.

Anthropic

Bases: Model

Thin wrapper around the anthropic.Anthropic client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the anthropic.Anthropic client.

Source code in outlines/models/anthropic.py
class Anthropic(Model):
    """Thin wrapper around the `anthropic.Anthropic` client.

    Translates the high-level input and output types supplied by the user
    into the arguments expected by the `anthropic.Anthropic` client.

    """
    def __init__(
        self, client: "AnthropicClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client
            An `anthropic.Anthropic` client.
        model_name
            The name of the model to use.

        """
        self.model_name = model_name
        self.client = client
        self.type_adapter = AnthropicTypeAdapter()

    def _apply_model_default(self, inference_kwargs: dict) -> None:
        """Fill in the default model name when the caller gave none."""
        if self.model_name is not None:
            inference_kwargs.setdefault("model", self.model_name)

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using Anthropic.

        Parameters
        ----------
        model_input
            The prompt from which the model generates a response.
        output_type
            Must be `None`: structured generation is not supported by
            Anthropic, and any other value raises an error at runtime.
        **inference_kwargs
            Additional keyword arguments forwarded to the client.

        Returns
        -------
        str
            The model's response.

        """
        request_body = self.type_adapter.format_input(model_input)

        if output_type is not None:
            raise NotImplementedError(
                f"The type {output_type} is not available with Anthropic."
            )

        self._apply_model_default(inference_kwargs)

        response = self.client.messages.create(
            **request_body,
            **inference_kwargs,
        )
        return response.content[0].text

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Unsupported: Anthropic offers no batch generation endpoint."""
        raise NotImplementedError(
            "Anthropic does not support batch generation."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using Anthropic.

        Parameters
        ----------
        model_input
            The prompt from which the model generates a response.
        output_type
            Must be `None`: structured generation is not supported by
            Anthropic, and any other value raises an error at runtime.
        **inference_kwargs
            Additional keyword arguments forwarded to the client.

        Returns
        -------
        Iterator[str]
            An iterator over the generated text fragments.

        """
        request_body = self.type_adapter.format_input(model_input)

        if output_type is not None:
            raise NotImplementedError(
                f"The type {output_type} is not available with Anthropic."
            )

        self._apply_model_default(inference_kwargs)

        events = self.client.messages.create(
            **request_body,
            stream=True,
            **inference_kwargs,
        )
        for event in events:
            is_text_delta = (
                event.type == "content_block_delta"
                and event.delta.type == "text_delta"
            )
            if is_text_delta:
                yield event.delta.text

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Anthropic

An anthropic.Anthropic client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/anthropic.py
def __init__(
    self, client: "AnthropicClient", model_name: Optional[str] = None
):
    """Store the client, default model name, and type adapter.

    Parameters
    ----------
    client
        An `anthropic.Anthropic` client.
    model_name
        The name of the model to use.

    """
    self.model_name = model_name
    self.client = client
    self.type_adapter = AnthropicTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs)

Generate text using Anthropic.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

As structured generation is not supported by Anthropic, the value of this argument must be None. Otherwise, an error will be raised at runtime.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The response generated by the model.

Source code in outlines/models/anthropic.py
def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text with the Anthropic API.

    Parameters
    ----------
    model_input
        The prompt from which the model generates a response.
    output_type
        Must be `None`: structured generation is not supported by
        Anthropic, and any other value raises an error at runtime.
    **inference_kwargs
        Additional keyword arguments forwarded to the client.

    Returns
    -------
    str
        The model's response.

    """
    request_body = self.type_adapter.format_input(model_input)

    if output_type is not None:
        raise NotImplementedError(
            f"The type {output_type} is not available with Anthropic."
        )

    if self.model_name is not None:
        inference_kwargs.setdefault("model", self.model_name)

    response = self.client.messages.create(
        **request_body,
        **inference_kwargs,
    )
    return response.content[0].text

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using Anthropic.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

As structured generation is not supported by Anthropic, the value of this argument must be None. Otherwise, an error will be raised at runtime.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/anthropic.py
def generate_stream(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text from the Anthropic API.

    Parameters
    ----------
    model_input
        The prompt from which the model generates a response.
    output_type
        Must be `None`: structured generation is not supported by
        Anthropic, and any other value raises an error at runtime.
    **inference_kwargs
        Additional keyword arguments forwarded to the client.

    Returns
    -------
    Iterator[str]
        An iterator over the generated text fragments.

    """
    request_body = self.type_adapter.format_input(model_input)

    if output_type is not None:
        raise NotImplementedError(
            f"The type {output_type} is not available with Anthropic."
        )

    if self.model_name is not None:
        inference_kwargs.setdefault("model", self.model_name)

    events = self.client.messages.create(
        **request_body,
        stream=True,
        **inference_kwargs,
    )
    for event in events:
        is_text_delta = (
            event.type == "content_block_delta"
            and event.delta.type == "text_delta"
        )
        if is_text_delta:
            yield event.delta.text

AsyncLMStudio

Bases: AsyncModel

Thin wrapper around a lmstudio.AsyncClient client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the LMStudio async client.

Source code in outlines/models/lmstudio.py
class AsyncLMStudio(AsyncModel):
    """Thin wrapper around a `lmstudio.AsyncClient` client.

    Translates the high-level input and output types supplied by the user
    into arguments for the LMStudio async client.

    """

    def __init__(
        self, client: "AsyncClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client
            An LMStudio `AsyncClient` instance.
        model_name
            The name of the model to use. If not provided, uses the default
            loaded model in LMStudio.

        """
        self.model_name = model_name
        self.client = client
        self.type_adapter = LMStudioTypeAdapter()
        # True once we have entered the client's async context ourselves.
        self._context_entered = False

    async def _enter_context(self) -> None:
        """Enter the client's async context on first use."""
        if not self._context_entered:
            await self.client.__aenter__()
            self._context_entered = True

    async def _resolve_model(self, kwargs: dict):
        """Pop the model key from `kwargs` and fetch the matching handle.

        Falls back to `self.model_name`, then to LMStudio's currently
        loaded default model.
        """
        model_key = kwargs.pop("model", self.model_name)
        if model_key:
            return await self.client.llm.model(model_key)
        return await self.client.llm.model()

    async def close(self) -> None:
        """Close the async client and release resources."""
        if not self._context_entered:
            return
        await self.client.__aexit__(None, None, None)
        self._context_entered = False

    async def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using LMStudio asynchronously.

        Parameters
        ----------
        model_input
            The prompt from which the model generates a response.
        output_type
            The desired response format; must be convertible to a JSON
            schema.
        **kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        str
            The generated text.

        """
        await self._enter_context()
        model = await self._resolve_model(kwargs)

        prompt = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)
        if response_format is not None:
            kwargs["response_format"] = response_format

        result = await model.respond(prompt, **kwargs)
        return result.content

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **kwargs,
    ):
        """Unsupported: the `lmstudio` library has no batch inference."""
        raise NotImplementedError(
            "The `lmstudio` library does not support batch inference."
        )

    async def generate_stream(  # type: ignore
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using LMStudio asynchronously.

        Parameters
        ----------
        model_input
            The prompt from which the model generates a response.
        output_type
            The desired response format; must be convertible to a JSON
            schema.
        **kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        AsyncIterator[str]
            An async iterator over the generated text fragments.

        """
        await self._enter_context()
        model = await self._resolve_model(kwargs)

        prompt = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)
        if response_format is not None:
            kwargs["response_format"] = response_format

        stream = await model.respond_stream(prompt, **kwargs)
        async for fragment in stream:
            yield fragment.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client AsyncClient

An LMStudio `AsyncClient` instance.

required
model_name Optional[str]

The name of the model to use. If not provided, uses the default loaded model in LMStudio.

None
Source code in outlines/models/lmstudio.py
def __init__(
    self, client: "AsyncClient", model_name: Optional[str] = None
):
    """Store the client, default model name, and type adapter.

    Parameters
    ----------
    client
        An LMStudio `AsyncClient` instance.
    model_name
        The name of the model to use. Defaults to the model currently
        loaded in LMStudio when not provided.

    """
    self.model_name = model_name
    self.client = client
    self.type_adapter = LMStudioTypeAdapter()
    # True once the client's async context has been entered.
    self._context_entered = False

close() async

Close the async client and release resources.

Source code in outlines/models/lmstudio.py
async def close(self) -> None:
    """Exit the client's async context, if we previously entered it."""
    if not self._context_entered:
        return
    await self.client.__aexit__(None, None, None)
    self._context_entered = False

generate(model_input, output_type=None, **kwargs) async

Generate text using LMStudio asynchronously.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/lmstudio.py
async def generate(
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> str:
    """Generate text using LMStudio asynchronously.

    Parameters
    ----------
    model_input
        The prompt from which the model generates a response.
    output_type
        The desired response format; must be convertible to a JSON
        schema.
    **kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    str
        The generated text.

    """
    if not self._context_entered:
        await self.client.__aenter__()
        self._context_entered = True

    # Caller-supplied "model" wins; otherwise fall back to the configured
    # name, then to LMStudio's currently loaded default model.
    model_key = kwargs.pop("model", self.model_name)
    if model_key:
        model = await self.client.llm.model(model_key)
    else:
        model = await self.client.llm.model()

    prompt = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)
    if response_format is not None:
        kwargs["response_format"] = response_format

    result = await model.respond(prompt, **kwargs)
    return result.content

generate_stream(model_input, output_type=None, **kwargs) async

Stream text using LMStudio asynchronously.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/lmstudio.py
async def generate_stream(  # type: ignore
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text using LMStudio asynchronously.

    Parameters
    ----------
    model_input
        The prompt from which the model generates a response.
    output_type
        The desired response format; must be convertible to a JSON
        schema.
    **kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    AsyncIterator[str]
        An async iterator over the generated text fragments.

    """
    if not self._context_entered:
        await self.client.__aenter__()
        self._context_entered = True

    # Caller-supplied "model" wins; otherwise fall back to the configured
    # name, then to LMStudio's currently loaded default model.
    model_key = kwargs.pop("model", self.model_name)
    if model_key:
        model = await self.client.llm.model(model_key)
    else:
        model = await self.client.llm.model()

    prompt = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)
    if response_format is not None:
        kwargs["response_format"] = response_format

    stream = await model.respond_stream(prompt, **kwargs)
    async for fragment in stream:
        yield fragment.content

AsyncMistral

Bases: AsyncModel

Async thin wrapper around the mistralai.Mistral client.

Converts input and output types to arguments for the mistralai.Mistral client's async methods (chat.complete_async or chat.stream_async).

Source code in outlines/models/mistral.py
class AsyncMistral(AsyncModel):
    """Async thin wrapper around the `mistralai.Mistral` client.

    Converts input and output types to arguments for the `mistralai.Mistral`
    client's async methods (`chat.complete_async` or `chat.stream_async`).

    """

    def __init__(
        self, client: "MistralClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client : MistralClient
            A mistralai.Mistral client instance.
        model_name : Optional[str]
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = MistralTypeAdapter()

    @staticmethod
    def _wrap_client_error(e: Exception) -> Exception:
        """Map a client exception to the error type surfaced to callers.

        Schema-related failures become `TypeError`, everything else becomes
        `RuntimeError`. The caller must raise the returned exception with
        `raise ... from e` so the original traceback is chained.
        """
        # "schema" also matches "json_schema" in the error message.
        if "schema" in str(e).lower():
            return TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            )
        return RuntimeError(f"Mistral API error: {e}")

    async def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate a response from the model asynchronously.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs : Any
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The response generated by the model as text.

        Raises
        ------
        TypeError
            If the client rejects the requested schema.
        RuntimeError
            For any other client error.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = await self.client.chat.complete_async(
                messages=messages,
                response_format=response_format,
                stream=False,
                **inference_kwargs,
            )
        except Exception as e:
            # Chain the original error in both branches so the full
            # traceback is preserved (previously only the RuntimeError
            # branch used `from e`).
            raise self._wrap_client_error(e) from e

        outputs = [choice.message for choice in result.choices]

        # A single choice is unwrapped to a plain string for convenience.
        if len(outputs) == 1:
            return outputs[0].content
        return [m.content for m in outputs]

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Unsupported: the mistralai library has no batch inference."""
        raise NotImplementedError(
            "The mistralai library does not support batch inference."
        )

    async def generate_stream(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Generate text from the model as an async stream of chunks.

        Parameters
        ----------
        model_input
            str, list, or chat input to generate from.
        output_type
            Optional type for structured output.
        **inference_kwargs
            Extra kwargs like "model" name.

        Yields
        ------
        str
            Chunks of text as they are streamed.

        Raises
        ------
        TypeError
            If the client rejects the requested schema.
        RuntimeError
            For any other client error.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            response = await self.client.chat.stream_async(
                messages=messages,
                response_format=response_format,
                **inference_kwargs
            )
        except Exception as e:
            # Chain the original error so the full traceback is preserved.
            raise self._wrap_client_error(e) from e

        async for chunk in response:
            # Skip keep-alive/metadata chunks that carry no text content.
            if (
                hasattr(chunk, "data")
                and chunk.data.choices
                and len(chunk.data.choices) > 0
                and hasattr(chunk.data.choices[0], "delta")
                and chunk.data.choices[0].delta.content is not None
            ):
                yield chunk.data.choices[0].delta.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Mistral

A mistralai.Mistral client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/mistral.py
def __init__(
    self, client: "MistralClient", model_name: Optional[str] = None
):
    """Store the client, default model name, and type adapter.

    Parameters
    ----------
    client : MistralClient
        A mistralai.Mistral client instance.
    model_name : Optional[str]
        The name of the model to use.

    """
    self.model_name = model_name
    self.client = client
    self.type_adapter = MistralTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs) async

Generate a response from the model asynchronously.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt or chat messages to generate a response from.

required
output_type Optional[Any]

The desired format of the response (e.g., JSON schema).

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The response generated by the model as text.

Source code in outlines/models/mistral.py
async def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate a response from the model asynchronously.

    Parameters
    ----------
    model_input : Union[Chat, list, str]
        The prompt or chat messages to generate a response from.
    output_type : Optional[Any]
        The desired format of the response (e.g., JSON schema).
    **inference_kwargs : Any
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The response generated by the model as text.

    Raises
    ------
    TypeError
        If the client rejects the requested schema.
    RuntimeError
        For any other client error.

    """
    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        result = await self.client.chat.complete_async(
            messages=messages,
            response_format=response_format,
            stream=False,
            **inference_kwargs,
        )
    except Exception as e:
        # Chain the original client error in both branches so the full
        # traceback is preserved (previously only the RuntimeError branch
        # used `from e`). Note "schema" also matches "json_schema".
        if "schema" in str(e).lower():
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        raise RuntimeError(f"Mistral API error: {e}") from e

    outputs = [choice.message for choice in result.choices]

    # A single choice is unwrapped to a plain string for convenience.
    if len(outputs) == 1:
        return outputs[0].content
    return [m.content for m in outputs]

generate_stream(model_input, output_type=None, **inference_kwargs) async

Generate text from the model as an async stream of chunks.

Parameters:

Name Type Description Default
model_input

str, list, or chat input to generate from.

required
output_type

Optional type for structured output.

None
**inference_kwargs

Extra kwargs like "model" name.

{}

Yields:

Type Description
str

Chunks of text as they are streamed.

Source code in outlines/models/mistral.py
async def generate_stream(
    self,
    model_input,
    output_type=None,
    **inference_kwargs,
):
    """Generate text from the model as an async stream of chunks.

    Parameters
    ----------
    model_input
        str, list, or chat input to generate from.
    output_type
        Optional type for structured output.
    **inference_kwargs
        Extra kwargs like "model" name.

    Yields
    ------
    str
        Chunks of text as they are streamed.

    Raises
    ------
    TypeError
        If the client rejects the requested schema.
    RuntimeError
        For any other client error.

    """
    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        response = await self.client.chat.stream_async(
            messages=messages,
            response_format=response_format,
            **inference_kwargs
        )
    except Exception as e:
        # Chain the original client error in both branches so the full
        # traceback is preserved (previously only the RuntimeError branch
        # used `from e`). Note "schema" also matches "json_schema".
        if "schema" in str(e).lower():
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        raise RuntimeError(f"Mistral API error: {e}") from e

    async for chunk in response:
        # Skip keep-alive/metadata chunks that carry no text content.
        if (
            hasattr(chunk, "data")
            and chunk.data.choices
            and len(chunk.data.choices) > 0
            and hasattr(chunk.data.choices[0], "delta")
            and chunk.data.choices[0].delta.content is not None
        ):
            yield chunk.data.choices[0].delta.content

AsyncOllama

Bases: AsyncModel

Thin wrapper around the ollama.AsyncClient client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the ollama.AsyncClient client.

Source code in outlines/models/ollama.py
class AsyncOllama(AsyncModel):
    """Thin wrapper around the `ollama.AsyncClient` client.

    Translates the high-level input and output types supplied by the user
    into arguments for the `ollama.AsyncClient` client.

    """

    def __init__(
        self, client: "AsyncClient", model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `ollama.AsyncClient` client.
        model_name
            The name of the model to use.

        """
        self.model_name = model_name
        self.client = client
        self.type_adapter = OllamaTypeAdapter()

    async def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using Ollama.

        Parameters
        ----------
        model_input
            The prompt from which the model generates a response.
        output_type
            The desired response format; must be convertible to a JSON
            schema.
        **kwargs
            Additional keyword arguments forwarded to the client.

        Returns
        -------
        str
            The generated text.

        """
        if self.model_name is not None:
            kwargs.setdefault("model", self.model_name)

        formatted_input = self.type_adapter.format_input(model_input)
        output_format = self.type_adapter.format_output_type(output_type)
        response = await self.client.chat(
            messages=formatted_input,
            format=output_format,
            **kwargs,
        )
        return response.message.content

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **kwargs,
    ):
        """Unsupported: the `ollama` library has no batch inference."""
        raise NotImplementedError(
            "The `ollama` library does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using Ollama.

        Parameters
        ----------
        model_input
            The prompt from which the model generates a response.
        output_type
            The desired response format; must be convertible to a JSON
            schema.
        **kwargs
            Additional keyword arguments forwarded to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator over the generated text fragments.

        """
        if self.model_name is not None:
            kwargs.setdefault("model", self.model_name)

        formatted_input = self.type_adapter.format_input(model_input)
        output_format = self.type_adapter.format_output_type(output_type)
        stream = await self.client.chat(
            messages=formatted_input,
            format=output_format,
            stream=True,
            **kwargs,
        )
        async for part in stream:
            yield part.message.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client AsyncClient

The ollama.AsyncClient client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/ollama.py
def __init__(
    self, client: "AsyncClient", model_name: Optional[str] = None,
):
    """Store the client, default model name, and type adapter.

    Parameters
    ----------
    client
        The `ollama.AsyncClient` client.
    model_name
        The name of the model to use.

    """
    self.model_name = model_name
    self.client = client
    self.type_adapter = OllamaTypeAdapter()

generate(model_input, output_type=None, **kwargs) async

Generate text using Ollama.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/ollama.py
async def generate(self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> str:
    """Generate a single completion with Ollama.

    Parameters
    ----------
    model_input
        The prompt from which the model generates a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments forwarded to the client.

    Returns
    -------
    str
        The text generated by the model.

    """
    # Fall back to the default model name unless the caller chose one.
    if self.model_name is not None and "model" not in kwargs:
        kwargs["model"] = self.model_name

    formatted_messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    response = await self.client.chat(
        messages=formatted_messages,
        format=response_format,
        **kwargs,
    )

    return response.message.content

generate_stream(model_input, output_type=None, **kwargs) async

Stream text using Ollama.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/ollama.py
async def generate_stream( # type: ignore
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> AsyncIterator[str]:
    """Stream a completion from Ollama chunk by chunk.

    Parameters
    ----------
    model_input
        The prompt from which the model generates a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments forwarded to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    # Fall back to the default model name unless the caller chose one.
    if self.model_name is not None and "model" not in kwargs:
        kwargs["model"] = self.model_name

    formatted_messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    stream = await self.client.chat(
        messages=formatted_messages,
        format=response_format,
        stream=True,
        **kwargs,
    )
    async for chunk in stream:
        yield chunk.message.content

AsyncOpenAI

Bases: AsyncModel

Thin wrapper around the openai.AsyncOpenAI client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.AsyncOpenAI client.

Source code in outlines/models/openai.py
class AsyncOpenAI(AsyncModel):
    """Thin wrapper around the `openai.AsyncOpenAI` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.AsyncOpenAI` client.

    """

    def __init__(
        self,
        client: Union["AsyncOpenAIClient", "AsyncAzureOpenAIClient"],
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OpenAITypeAdapter()

    @staticmethod
    def _raise_if_invalid_schema(e: Exception) -> None:
        """Translate an "Invalid schema" `BadRequestError` into a `TypeError`.

        Does nothing for any other error so the caller can re-raise it.
        """
        if e.body["message"].startswith("Invalid schema"):
            # `from e` keeps the original error as __cause__ for debugging.
            raise TypeError(
                f"OpenAI does not support your schema: {e.body['message']}. "
                "Try a local model or dottxt instead."
            ) from e

    async def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A list is returned when the
            request yields several choices.

        Raises
        ------
        TypeError
            If OpenAI rejects the JSON schema derived from `output_type`.
        ValueError
            If the model refuses to answer the request.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        # Fall back to the default model name when the caller does not
        # specify one explicitly.
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = await self.client.chat.completions.create(
                messages=messages,
                **response_format,
                **inference_kwargs,
            )
        except openai.BadRequestError as e:
            self._raise_if_invalid_schema(e)
            raise  # bare raise preserves the original traceback

        # One message per requested completion choice.
        choice_messages = [choice.message for choice in result.choices]
        for message in choice_messages:
            if message.refusal is not None:
                raise ValueError(
                    f"OpenAI refused to answer the request: {message.refusal}"
                )

        if len(choice_messages) == 1:
            return choice_messages[0].content
        else:
            return [message.content for message in choice_messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported: the `openai` library has no batch inference API."""
        raise NotImplementedError(
            "The `openai` library does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs,
    ) -> AsyncIterator[str]:
        """Stream text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        Raises
        ------
        TypeError
            If OpenAI rejects the JSON schema derived from `output_type`.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            stream = await self.client.chat.completions.create(
                stream=True,
                messages=messages,
                **response_format,
                **inference_kwargs
            )
        except openai.BadRequestError as e:
            self._raise_if_invalid_schema(e)
            raise  # bare raise preserves the original traceback

        async for chunk in stream:
            # Skip chunks that carry no content delta.
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Union[AsyncOpenAI, AsyncAzureOpenAI]

The openai.AsyncOpenAI or openai.AsyncAzureOpenAI client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/openai.py
def __init__(
    self,
    client: Union["AsyncOpenAIClient", "AsyncAzureOpenAIClient"],
    model_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    client
        The `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` client.
    model_name
        The name of the model to use.

    """
    # Keep the raw client plus an adapter that translates Outlines-level
    # inputs/output types into OpenAI chat-completion arguments.
    self.type_adapter = OpenAITypeAdapter()
    self.model_name = model_name
    self.client = client

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using OpenAI.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Union[type[BaseModel], str]]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema or an empty dictionary.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/openai.py
async def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Union[type[BaseModel], str]] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using OpenAI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema or an empty dictionary.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model. A list is returned when the
        request yields several choices.

    Raises
    ------
    TypeError
        If OpenAI rejects the JSON schema derived from `output_type`.
    ValueError
        If the model refuses to answer the request.

    """
    import openai

    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    # Fall back to the default model name when the caller does not
    # specify one explicitly.
    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        result = await self.client.chat.completions.create(
            messages=messages,
            **response_format,
            **inference_kwargs,
        )
    except openai.BadRequestError as e:
        if e.body["message"].startswith("Invalid schema"):
            # `from e` keeps the original error as __cause__ for debugging.
            raise TypeError(
                f"OpenAI does not support your schema: {e.body['message']}. "
                "Try a local model or dottxt instead."
            ) from e
        raise  # bare raise preserves the original traceback

    # Use a distinct name: `messages` above holds the *input* messages.
    choice_messages = [choice.message for choice in result.choices]
    for message in choice_messages:
        if message.refusal is not None:
            raise ValueError(
                f"OpenAI refused to answer the request: {message.refusal}"
            )

    if len(choice_messages) == 1:
        return choice_messages[0].content
    else:
        return [message.content for message in choice_messages]

generate_stream(model_input, output_type=None, **inference_kwargs) async

Stream text using OpenAI.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Union[type[BaseModel], str]]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema or an empty dictionary.

None
**inference_kwargs

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/openai.py
async def generate_stream( # type: ignore
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Union[type[BaseModel], str]] = None,
    **inference_kwargs,
) -> AsyncIterator[str]:
    """Stream text using OpenAI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema or an empty dictionary.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    Raises
    ------
    TypeError
        If OpenAI rejects the JSON schema derived from `output_type`.

    """
    import openai

    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    # Fall back to the default model name when the caller does not
    # specify one explicitly.
    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        stream = await self.client.chat.completions.create(
            stream=True,
            messages=messages,
            **response_format,
            **inference_kwargs
        )
    except openai.BadRequestError as e:
        if e.body["message"].startswith("Invalid schema"):
            # `from e` keeps the original error as __cause__ for debugging.
            raise TypeError(
                f"OpenAI does not support your schema: {e.body['message']}. "
                "Try a local model or dottxt instead."
            ) from e
        raise  # bare raise preserves the original traceback

    async for chunk in stream:
        # Skip chunks that carry no content delta.
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content

AsyncSGLang

Bases: AsyncModel

Thin async wrapper around the openai.AsyncOpenAI client used to communicate with an SGLang server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the SGLang server.

Source code in outlines/models/sglang.py
class AsyncSGLang(AsyncModel):
    """Thin async wrapper around the `openai.AsyncOpenAI` client used to
    communicate with an SGLang server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.AsyncOpenAI` client for
    the SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using `sglang`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A list is returned when the
            request yields several choices.

        Raises
        ------
        ValueError
            If the server refuses to answer the request.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The sglang server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported by SGLang."""
        raise NotImplementedError(
            "SGLang does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Return a text generator.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        async for chunk in stream:  # pragma: no cover
            # Skip chunks that carry no content delta.
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the SGLang client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        inference_kwargs.update(output_type_args)

        # Fall back to the default model name when the caller does not
        # specify one explicitly.
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }

        return client_args

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client

An openai.AsyncOpenAI client instance.

required
model_name Optional[str]

The name of the model to use.

None

Parameters:

Name Type Description Default
client

An openai.AsyncOpenAI client instance.

required
Source code in outlines/models/sglang.py
def __init__(self, client, model_name: Optional[str] = None):
    """
    Parameters
    ----------
    client
        An `openai.AsyncOpenAI` client instance.
    model_name
        The name of the model to use.

    """
    # Store the client and default model name; the type adapter converts
    # Outlines-level inputs/output types into SGLang client arguments.
    self.client = client
    self.model_name = model_name
    self.type_adapter = SGLangTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using sglang.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/sglang.py
async def generate(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using `sglang`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    response = await self.client.chat.completions.create(**client_args)

    # Collect each choice's content, refusing to proceed on a refusal.
    contents = []
    for choice in response.choices:
        message = choice.message
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The sglang server refused to answer the request: "
                f"{message.refusal}"
            )
        contents.append(message.content)

    # A single choice unwraps to a plain string.
    return contents[0] if len(contents) == 1 else contents

generate_stream(model_input, output_type=None, **inference_kwargs) async

Return a text generator.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/sglang.py
async def generate_stream( # type: ignore
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Return a text generator.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    stream = await self.client.chat.completions.create(
        **client_args,
        stream=True,
    )

    async for chunk in stream:  # pragma: no cover
        choices = chunk.choices
        if not choices:
            # Some chunks carry no choices at all; skip them.
            continue
        piece = choices[0].delta.content
        if piece is not None:
            yield piece

AsyncTGI

Bases: AsyncModel

Thin async wrapper around a huggingface_hub.AsyncInferenceClient client used to communicate with a TGI server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the huggingface_hub.AsyncInferenceClient client.

Source code in outlines/models/tgi.py
class AsyncTGI(AsyncModel):
    """Thin async wrapper around a `huggingface_hub.AsyncInferenceClient`
    client used to communicate with a `TGI` server.

    The wrapper translates the input and output types specified by the
    users at a higher level into arguments to the
    `huggingface_hub.AsyncInferenceClient` client.

    """

    def __init__(self, client):
        """
        Parameters
        ----------
        client
            A huggingface `AsyncInferenceClient` client instance.

        """
        # The adapter converts Outlines-level inputs/output types into
        # keyword arguments for the TGI client.
        self.client = client
        self.type_adapter = TGITypeAdapter()

    async def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server
            uses a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        return await self.client.text_generation(
            **self._build_client_args(
                model_input, output_type, **inference_kwargs,
            )
        )

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported by TGI."""
        raise NotImplementedError("TGI does not support batch inference.")

    async def generate_stream( # type: ignore
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server
            uses a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        stream = await self.client.text_generation(
            stream=True,
            **self._build_client_args(
                model_input, output_type, **inference_kwargs,
            ),
        )

        # Forward each raw text chunk exactly as the server sends it.
        async for chunk in stream:  # pragma: no cover
            yield chunk

    def _build_client_args(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Assemble the keyword arguments for the TGI client call."""
        inference_kwargs.update(
            self.type_adapter.format_output_type(output_type)
        )
        return {
            "prompt": self.type_adapter.format_input(model_input),
            **inference_kwargs,
        }

__init__(client)

Parameters:

Name Type Description Default
client

A huggingface AsyncInferenceClient client instance.

required
Source code in outlines/models/tgi.py
def __init__(self, client):
    """
    Parameters
    ----------
    client
        A huggingface `AsyncInferenceClient` client instance.

    """
    # The adapter converts Outlines-level inputs/output types into
    # keyword arguments for the TGI client.
    self.type_adapter = TGITypeAdapter()
    self.client = client

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using TGI.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types except CFG are supported provided your server uses a backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/tgi.py
async def generate(
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using TGI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types except `CFG` are supported provided your server uses
        a backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    # The text-generation endpoint returns the completed text directly.
    return await self.client.text_generation(**client_args)

generate_stream(model_input, output_type=None, **inference_kwargs) async

Stream text using TGI.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types except CFG are supported provided your server uses a backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/tgi.py
async def generate_stream( # type: ignore
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text using TGI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types except `CFG` are supported provided your server uses
        a backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    stream = await self.client.text_generation(
        **client_args, stream=True
    )

    # Forward each raw text chunk exactly as the server sends it.
    async for chunk in stream:  # pragma: no cover
        yield chunk

AsyncVLLM

Bases: AsyncModel

Thin async wrapper around the openai.AsyncOpenAI client used to communicate with a vllm server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the vllm server.

Source code in outlines/models/vllm.py
class AsyncVLLM(AsyncModel):
    """Thin async wrapper around the `openai.OpenAI` client used to communicate
    with a `vllm` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    `vllm` server.
    """

    def __init__(
        self,
        client: "AsyncOpenAI",
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.
        model_name
            The name of the model to use. When provided, it is used as the
            default `model` argument for requests that do not specify one.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = VLLMTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A plain string when the server
            returns a single choice, a list of strings otherwise.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        # Raise an explicit error when the server refused the request
        # instead of silently returning `None` content.
        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The vLLM server refused to answer the request: "
                    f"{message.refusal}"
                )

        # One choice -> plain string; several choices -> list of strings.
        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        # Batch inference is not available through the vLLM server API.
        raise NotImplementedError("VLLM does not support batch inference.")

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.
        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        # Only yield chunks that actually carry new content; chunks with no
        # choices or a `None` delta are skipped.
        async for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the OpenAI client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        # Structured-output arguments are merged into `extra_body`, on top of
        # any `extra_body` the caller supplied.
        extra_body = inference_kwargs.pop("extra_body", {})
        extra_body.update(output_type_args)

        # Fall back to the instance-level model name when the caller did not
        # pass one explicitly.
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }
        if extra_body:
            client_args["extra_body"] = extra_body

        return client_args

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client AsyncOpenAI

An openai.AsyncOpenAI client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/vllm.py
def __init__(
    self,
    client: "AsyncOpenAI",
    model_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    client
        An `openai.AsyncOpenAI` client instance.
    model_name
        The name of the model to use. When provided, it is used as the
        default `model` argument for requests that do not specify one.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = VLLMTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using vLLM.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/vllm.py
async def generate(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text through a vLLM server.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    request_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    completion = await self.client.chat.completions.create(**request_args)

    # Fail loudly if the server refused any of the choices.
    choice_messages = [choice.message for choice in completion.choices]
    for choice_message in choice_messages:
        if choice_message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The vLLM server refused to answer the request: "
                f"{choice_message.refusal}"
            )

    # A single choice is unwrapped to a plain string.
    if len(choice_messages) == 1:
        return choice_messages[0].content
    return [choice_message.content for choice_message in choice_messages]

generate_stream(model_input, output_type=None, **inference_kwargs) async

Stream text using vLLM.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/vllm.py
async def generate_stream( # type: ignore
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text generated through a vLLM server.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.
    """
    request_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    response_stream = await self.client.chat.completions.create(
        stream=True,
        **request_args,
    )

    # Skip keep-alive chunks with no choices and deltas without content.
    async for event in response_stream:  # pragma: no cover
        if not event.choices:
            continue
        delta_text = event.choices[0].delta.content
        if delta_text is not None:
            yield delta_text

Dottxt

Bases: Model

Thin wrapper around the dottxt.client.Dottxt client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the dottxt.client.Dottxt client.

Source code in outlines/models/dottxt.py
class Dottxt(Model):
    """Thin wrapper around the `dottxt.client.Dottxt` client.

    This wrapper translates the high-level input and output types used in
    Outlines into the arguments expected by the `dottxt.client.Dottxt`
    client.

    """

    def __init__(
        self,
        client: "DottxtClient",
        model_name: Optional[str] = None,
        model_revision: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            A `dottxt.Dottxt` client.
        model_name
            The name of the model to use.
        model_revision
            The revision of the model to use.

        """
        self.type_adapter = DottxtTypeAdapter()
        self.client = client
        self.model_name = model_name
        self.model_revision = model_revision

    def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using Dottxt.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)
        json_schema = self.type_adapter.format_output_type(output_type)

        # Fill in the instance-level defaults unless the caller already
        # provided a value for this particular request.
        for key, default in (
            ("model_name", self.model_name),
            ("model_revision", self.model_revision),
        ):
            if key not in inference_kwargs and default is not None:
                inference_kwargs[key] = default

        completion = self.client.json(
            prompt,
            json_schema,
            **inference_kwargs,
        )
        return completion.data

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        # Batch generation is not part of the Dottxt API.
        raise NotImplementedError(
            "Dottxt does not support batch generation."
        )

    def generate_stream(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Not available for Dottxt."""
        message = (
            "Dottxt does not support streaming. Call the model/generator for "
            "regular generation instead."
        )
        raise NotImplementedError(message)

__init__(client, model_name=None, model_revision=None)

Parameters:

Name Type Description Default
client Dottxt

A dottxt.Dottxt client.

required
model_name Optional[str]

The name of the model to use.

None
model_revision Optional[str]

The revision of the model to use.

None
Source code in outlines/models/dottxt.py
def __init__(
    self,
    client: "DottxtClient",
    model_name: Optional[str] = None,
    model_revision: Optional[str] = None,
):
    """Initialize the Dottxt model wrapper.

    Parameters
    ----------
    client
        A `dottxt.Dottxt` client.
    model_name
        The name of the model to use.
    model_revision
        The revision of the model to use.

    """
    # Assignments are independent; the adapter is created first so the
    # instance is usable for type conversion as soon as it exists.
    self.type_adapter = DottxtTypeAdapter()
    self.client = client
    self.model_name = model_name
    self.model_revision = model_revision

generate(model_input, output_type=None, **inference_kwargs)

Generate text using Dottxt.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/dottxt.py
def generate(
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using Dottxt.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The text generated by the model.

    """
    prompt = self.type_adapter.format_input(model_input)
    json_schema = self.type_adapter.format_output_type(output_type)

    # Fill in the instance-level defaults unless the caller already
    # provided a value for this particular request.
    for key, default in (
        ("model_name", self.model_name),
        ("model_revision", self.model_revision),
    ):
        if key not in inference_kwargs and default is not None:
            inference_kwargs[key] = default

    completion = self.client.json(
        prompt,
        json_schema,
        **inference_kwargs,
    )
    return completion.data

generate_stream(model_input, output_type=None, **inference_kwargs)

Not available for Dottxt.

Source code in outlines/models/dottxt.py
def generate_stream(
    self,
    model_input,
    output_type=None,
    **inference_kwargs,
):
    """Not available for Dottxt."""
    # The Dottxt API has no streaming endpoint; point the user at the
    # regular generation path instead.
    message = (
        "Dottxt does not support streaming. Call the model/generator for "
        "regular generation instead."
    )
    raise NotImplementedError(message)

Gemini

Bases: Model

Thin wrapper around the google.genai.Client client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the google.genai.Client client.

Source code in outlines/models/gemini.py
class Gemini(Model):
    """Thin wrapper around the `google.genai.Client` client.

    This wrapper is used to convert the input and output types specified by
    the users at a higher level to arguments to the `google.genai.Client`
    client.

    """

    def __init__(self, client: "Client", model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            A `google.genai.Client` instance.
        model_name
            The name of the model to use. Used as the default `model`
            argument when the caller does not pass one at inference time.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = GeminiTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs,
    ) -> str:
        """Generate a response from the model.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema, a list of such types, or a multiple choice type.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The response generated by the model.

        """
        contents = self.type_adapter.format_input(model_input)
        generation_config = self.type_adapter.format_output_type(output_type)

        # `pop("model", ...)` runs before the `config` dict is built (keyword
        # arguments are evaluated left to right), so a caller-provided
        # `model` is removed from `inference_kwargs` and never leaks into
        # `config`.
        completion = self.client.models.generate_content(
            **contents,
            model=inference_kwargs.pop("model", self.model_name),
            config={**generation_config, **inference_kwargs}
        )

        return completion.text

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        # Batch generation is not available through this client.
        raise NotImplementedError(
            "Gemini does not support batch generation."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs,
    ) -> Iterator[str]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema, a list of such types, or a multiple choice type.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        contents = self.type_adapter.format_input(model_input)
        generation_config = self.type_adapter.format_output_type(output_type)

        # Same `pop`-before-`config` ordering as in `generate` above.
        stream = self.client.models.generate_content_stream(
            **contents,
            model=inference_kwargs.pop("model", self.model_name),
            config={**generation_config, **inference_kwargs},
        )

        # Only yield chunks that carry non-empty text.
        for chunk in stream:
            if hasattr(chunk, "text") and chunk.text:
                yield chunk.text

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Client

A google.genai.Client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/gemini.py
def __init__(self, client: "Client", model_name: Optional[str] = None):
    """Initialize the Gemini model wrapper.

    Parameters
    ----------
    client
        A `google.genai.Client` instance.
    model_name
        The name of the model to use.

    """
    # Assignments are independent of one another; order is arbitrary.
    self.type_adapter = GeminiTypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **inference_kwargs)

Generate a response from the model.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema, a list of such types, or a multiple choice type.

None
**inference_kwargs

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The response generated by the model.

Source code in outlines/models/gemini.py
def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs,
) -> str:
    """Generate a response from the model.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema, a list of such types, or a multiple choice type.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The response generated by the model.

    """
    contents = self.type_adapter.format_input(model_input)
    output_config = self.type_adapter.format_output_type(output_type)

    # Pull `model` out of the kwargs first so it cannot leak into the
    # merged `config` dict below.
    model_name = inference_kwargs.pop("model", self.model_name)
    merged_config = {**output_config, **inference_kwargs}

    result = self.client.models.generate_content(
        **contents,
        model=model_name,
        config=merged_config,
    )

    return result.text

generate_stream(model_input, output_type=None, **inference_kwargs)

Generate a stream of responses from the model.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema, a list of such types, or a multiple choice type.

None
**inference_kwargs

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/gemini.py
def generate_stream(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs,
) -> Iterator[str]:
    """Generate a stream of responses from the model.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema, a list of such types, or a multiple choice type.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    contents = self.type_adapter.format_input(model_input)
    output_config = self.type_adapter.format_output_type(output_type)

    # Pull `model` out of the kwargs first so it cannot leak into the
    # merged `config` dict below.
    model_name = inference_kwargs.pop("model", self.model_name)
    merged_config = {**output_config, **inference_kwargs}

    stream = self.client.models.generate_content_stream(
        **contents,
        model=model_name,
        config=merged_config,
    )

    # Only yield chunks that carry non-empty text.
    for chunk in stream:
        text = getattr(chunk, "text", None)
        if text:
            yield text

LMStudio

Bases: Model

Thin wrapper around a lmstudio.Client client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the LMStudio client.

Source code in outlines/models/lmstudio.py
class LMStudio(Model):
    """Thin wrapper around a `lmstudio.Client` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the LMStudio client.

    """

    def __init__(self, client: "Client", model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            A LMStudio Client instance obtained via `lmstudio.Client()` or
            `lmstudio.get_default_client()`.
        model_name
            The name of the model to use. If not provided, uses the default
            loaded model in LMStudio.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = LMStudioTypeAdapter()

    def _prepare_call(self, model_input, output_type, kwargs):
        """Resolve the model handle and call arguments for a request.

        Shared by `generate` and `generate_stream`, which previously
        duplicated this logic verbatim.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model.
        kwargs
            The caller's keyword arguments; mutated in place (the `model`
            key is removed, `response_format` may be added).

        Returns
        -------
        tuple
            A `(model, formatted_input, kwargs)` triple ready to be passed
            to `model.respond` / `model.respond_stream`.

        """
        # Fall back to the instance default model only when the caller did
        # not pass `model` at all (an explicit `model=None` still selects
        # the default loaded model).
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        model_key = kwargs.pop("model", None)
        model = self.client.llm.model(model_key) if model_key else self.client.llm.model()

        formatted_input = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if response_format is not None:
            kwargs["response_format"] = response_format

        return model, formatted_input, kwargs

    def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using LMStudio.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        str
            The text generated by the model.

        """
        model, formatted_input, kwargs = self._prepare_call(
            model_input, output_type, kwargs
        )
        result = model.respond(formatted_input, **kwargs)
        return result.content

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **kwargs,
    ):
        raise NotImplementedError(
            "The `lmstudio` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using LMStudio.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        model, formatted_input, kwargs = self._prepare_call(
            model_input, output_type, kwargs
        )
        stream = model.respond_stream(formatted_input, **kwargs)
        for fragment in stream:
            yield fragment.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Client

A LMStudio Client instance obtained via lmstudio.Client() or lmstudio.get_default_client().

required
model_name Optional[str]

The name of the model to use. If not provided, uses the default loaded model in LMStudio.

None
Source code in outlines/models/lmstudio.py
def __init__(self, client: "Client", model_name: Optional[str] = None):
    """Initialize the LMStudio model wrapper.

    Parameters
    ----------
    client
        A LMStudio Client instance obtained via `lmstudio.Client()` or
        `lmstudio.get_default_client()`.
    model_name
        The name of the model to use. If not provided, uses the default
        loaded model in LMStudio.

    """
    # Assignments are independent of one another; order is arbitrary.
    self.type_adapter = LMStudioTypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **kwargs)

Generate text using LMStudio.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/lmstudio.py
def generate(
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> str:
    """Generate text using LMStudio.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    str
        The text generated by the model.

    """
    # Use the caller's `model` when given (even if explicitly None);
    # otherwise fall back to the instance default.
    if "model" in kwargs:
        model_key = kwargs.pop("model")
    else:
        model_key = self.model_name

    # A falsy key selects the default loaded model.
    if model_key:
        llm = self.client.llm.model(model_key)
    else:
        llm = self.client.llm.model()

    formatted_input = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)
    if response_format is not None:
        kwargs["response_format"] = response_format

    result = llm.respond(formatted_input, **kwargs)
    return result.content

generate_stream(model_input, output_type=None, **kwargs)

Stream text using LMStudio.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/lmstudio.py
def generate_stream(
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> Iterator[str]:
    """Stream text using LMStudio.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    # Use the caller's `model` when given (even if explicitly None);
    # otherwise fall back to the instance default.
    if "model" in kwargs:
        model_key = kwargs.pop("model")
    else:
        model_key = self.model_name

    # A falsy key selects the default loaded model.
    if model_key:
        llm = self.client.llm.model(model_key)
    else:
        llm = self.client.llm.model()

    formatted_input = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)
    if response_format is not None:
        kwargs["response_format"] = response_format

    for fragment in llm.respond_stream(formatted_input, **kwargs):
        yield fragment.content

LlamaCpp

Bases: Model

Thin wrapper around the llama_cpp.Llama model.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the llama_cpp.Llama model.

Source code in outlines/models/llamacpp.py
class LlamaCpp(Model):
    """Thin wrapper around the `llama_cpp.Llama` model.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `llama_cpp.Llama` model.
    """

    tensor_library_name = "numpy"

    def __init__(self, model: "Llama", chat_mode: bool = True):
        """
        Parameters
        ----------
        model
            A `llama_cpp.Llama` model instance.
        chat_mode
            Whether to enable chat mode. If `False`, the model will regard
            all `str` inputs as plain text prompts. If `True`, the model will
            regard all `str` inputs as user messages in a chat conversation.

        """
        self.model = model
        self.tokenizer = LlamaCppTokenizer(self.model)

        # llama-cpp-python falls back to a default chat template even when the
        # user has not configured one explicitly:
        # https://github.com/abetlen/llama-cpp-python/blob/c37132b/llama_cpp/llama.py#L540-L545
        # Chat mode therefore defaults to True, matching upstream's preference
        # for chat-style usage.
        self.type_adapter = LlamaCppTypeAdapter(has_chat_template=chat_mode)

    def generate(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using `llama-cpp-python`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        **inference_kwargs
            Additional keyword arguments to pass to the `Llama.__call__`
            method of the `llama-cpp-python` library.

        Returns
        -------
        str
            The text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)

        if isinstance(prompt, str):
            # Plain-text completion endpoint.
            response = self.model(
                prompt,
                logits_processor=self.type_adapter.format_output_type(output_type),
                **inference_kwargs,
            )
            text = response["choices"][0]["text"]
        elif isinstance(prompt, list):
            # Chat-style completion endpoint.
            response = self.model.create_chat_completion(
                prompt,
                logits_processor=self.type_adapter.format_output_type(output_type),
                **inference_kwargs,
            )
            text = response["choices"][0]["message"]["content"]
        else:  # Never reached  # pragma: no cover
            raise ValueError("Unexpected prompt type.")

        # Clear the model's internal state so subsequent calls start fresh.
        self.model.reset()

        return text

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch generation is not available for `llama-cpp-python`."""
        raise NotImplementedError("LlamaCpp does not support batch generation.")

    def generate_stream(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using `llama-cpp-python`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        **inference_kwargs
            Additional keyword arguments to pass to the `Llama.__call__`
            method of the `llama-cpp-python` library.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)
        processor = self.type_adapter.format_output_type(output_type)

        if isinstance(prompt, str):
            # Plain-text completion endpoint.
            for chunk in self.model(
                prompt,
                logits_processor=processor,
                stream=True,
                **inference_kwargs,
            ):
                yield chunk["choices"][0]["text"]
        elif isinstance(prompt, list):
            # Chat-style endpoint: a delta may omit the content key entirely.
            for chunk in self.model.create_chat_completion(
                prompt,
                logits_processor=processor,
                stream=True,
                **inference_kwargs,
            ):
                yield chunk["choices"][0]["delta"].get("content", "")
        else:  # Never reached  # pragma: no cover
            raise ValueError("Unexpected prompt type.")

__init__(model, chat_mode=True)

Parameters:

Name Type Description Default
model Llama

A llama_cpp.Llama model instance.

required
chat_mode bool

Whether to enable chat mode. If False, the model will regard all str inputs as plain text prompts. If True, the model will regard all str inputs as user messages in a chat conversation.

True
Source code in outlines/models/llamacpp.py
def __init__(self, model: "Llama", chat_mode: bool = True):
    """
    Parameters
    ----------
    model
        A `llama_cpp.Llama` model instance.
    chat_mode
        Whether to enable chat mode. If `False`, the model will regard
        all `str` inputs as plain text prompts. If `True`, the model will
        regard all `str` inputs as user messages in a chat conversation.

    """
    self.model = model
    self.tokenizer = LlamaCppTokenizer(self.model)

    # llama-cpp-python falls back to a default chat template even when the
    # user has not configured one explicitly:
    # https://github.com/abetlen/llama-cpp-python/blob/c37132b/llama_cpp/llama.py#L540-L545
    # Chat mode therefore defaults to True, matching upstream's preference
    # for chat-style usage.
    self.type_adapter = LlamaCppTypeAdapter(has_chat_template=chat_mode)

generate(model_input, output_type=None, **inference_kwargs)

Generate text using llama-cpp-python.

Parameters:

Name Type Description Default
model_input Union[Chat, str]

The prompt based on which the model will generate a response.

required
output_type Optional[OutlinesLogitsProcessor]

The logits processor the model will use to constrain the format of the generated text.

None
**inference_kwargs Any

Additional keyword arguments to pass to the Llama.__call__ method of the llama-cpp-python library.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/llamacpp.py
def generate(
    self,
    model_input: Union[Chat, str],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using `llama-cpp-python`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    **inference_kwargs
        Additional keyword arguments to pass to the `Llama.__call__`
        method of the `llama-cpp-python` library.

    Returns
    -------
    str
        The text generated by the model.

    """
    prompt = self.type_adapter.format_input(model_input)

    if isinstance(prompt, str):
        # Plain-text completion endpoint.
        response = self.model(
            prompt,
            logits_processor=self.type_adapter.format_output_type(output_type),
            **inference_kwargs,
        )
        text = response["choices"][0]["text"]
    elif isinstance(prompt, list):
        # Chat-style completion endpoint.
        response = self.model.create_chat_completion(
            prompt,
            logits_processor=self.type_adapter.format_output_type(output_type),
            **inference_kwargs,
        )
        text = response["choices"][0]["message"]["content"]
    else:  # Never reached  # pragma: no cover
        raise ValueError("Unexpected prompt type.")

    # Clear the model's internal state so subsequent calls start fresh.
    self.model.reset()

    return text

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using llama-cpp-python.

Parameters:

Name Type Description Default
model_input Union[Chat, str]

The prompt based on which the model will generate a response.

required
output_type Optional[OutlinesLogitsProcessor]

The logits processor the model will use to constrain the format of the generated text.

None
**inference_kwargs Any

Additional keyword arguments to pass to the Llama.__call__ method of the llama-cpp-python library.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/llamacpp.py
def generate_stream(
    self,
    model_input: Union[Chat, str],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using `llama-cpp-python`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    **inference_kwargs
        Additional keyword arguments to pass to the `Llama.__call__`
        method of the `llama-cpp-python` library.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    prompt = self.type_adapter.format_input(model_input)
    processor = self.type_adapter.format_output_type(output_type)

    if isinstance(prompt, str):
        # Plain-text completion endpoint.
        for chunk in self.model(
            prompt,
            logits_processor=processor,
            stream=True,
            **inference_kwargs,
        ):
            yield chunk["choices"][0]["text"]
    elif isinstance(prompt, list):
        # Chat-style endpoint: a delta may omit the content key entirely.
        for chunk in self.model.create_chat_completion(
            prompt,
            logits_processor=processor,
            stream=True,
            **inference_kwargs,
        ):
            yield chunk["choices"][0]["delta"].get("content", "")
    else:  # Never reached  # pragma: no cover
        raise ValueError("Unexpected prompt type.")

MLXLM

Bases: Model

Thin wrapper around an mlx_lm model.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the mlx_lm library.

Source code in outlines/models/mlxlm.py
class MLXLM(Model):
    """Thin wrapper around an `mlx_lm` model.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `mlx_lm` library.

    """

    tensor_library_name = "mlx"

    def __init__(
        self,
        model: "nn.Module",
        tokenizer: "PreTrainedTokenizer",
    ):
        """
        Parameters
        ----------
        model
            An instance of an `mlx_lm` model.
        tokenizer
            An instance of an `mlx_lm` tokenizer or of a compatible
            `transformers` tokenizer.

        """
        self.model = model
        # self.mlx_tokenizer is used by the mlx-lm in its generate function
        self.mlx_tokenizer = tokenizer
        # self.tokenizer is used by the logits processor
        self.tokenizer = TransformerTokenizer(tokenizer._tokenizer)
        self.type_adapter = MLXLMTypeAdapter(
            tokenizer=tokenizer,
            has_chat_template=_check_hf_chat_template(tokenizer)
        )

    def generate(
        self,
        model_input: str,
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **kwargs,
    ) -> str:
        """Generate text using `mlx-lm`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        kwargs
            Additional keyword arguments to pass to the `mlx-lm` library.

        Returns
        -------
        str
            The text generated by the model.

        """
        from mlx_lm import generate

        return generate(
            self.model,
            self.mlx_tokenizer,
            self.type_adapter.format_input(model_input),
            logits_processors=self.type_adapter.format_output_type(output_type),
            **kwargs,
        )

    def generate_batch(
        self,
        model_input: list[str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **kwargs,
    ) -> list[str]:
        """Generate a batch of text using `mlx-lm`.

        Parameters
        ----------
        model_input
            The list of prompts based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        kwargs
            Additional keyword arguments to pass to the `mlx-lm` library.

        Returns
        -------
        list[str]
            The list of text generated by the model.

        Raises
        ------
        NotImplementedError
            If `output_type` is provided. mlx-lm cannot apply a logits
            processor in batch mode.

        """
        # Fail fast before importing mlx_lm so a missing dependency does not
        # mask the real error. The previous message concatenation was also
        # missing a space between sentences.
        if output_type:
            raise NotImplementedError(
                "mlx-lm does not support constrained generation with batching. "
                "You cannot provide an `output_type` with this method."
            )

        from mlx_lm import batch_generate

        prompts = [self.type_adapter.format_input(item) for item in model_input]

        # Contrarily to the other generate methods, batch_generate requires
        # tokenized prompts. Only add special tokens (e.g. BOS) when the
        # prompt does not already start with the BOS token.
        tokenized_model_input = [
            self.mlx_tokenizer.encode(
                prompt,
                add_special_tokens=(
                    self.mlx_tokenizer.bos_token is None
                    or not prompt.startswith(self.mlx_tokenizer.bos_token)
                ),
            )
            for prompt in prompts
        ]

        response = batch_generate(
            self.model,
            self.mlx_tokenizer,
            tokenized_model_input,
            **kwargs,
        )

        return response.texts

    def generate_stream(
        self,
        model_input: str,
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **kwargs,
    ) -> Iterator[str]:
        """Stream text using `mlx-lm`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        kwargs
            Additional keyword arguments to pass to the `mlx-lm` library.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        from mlx_lm import stream_generate

        for gen_response in stream_generate(
            self.model,
            self.mlx_tokenizer,
            self.type_adapter.format_input(model_input),
            logits_processors=self.type_adapter.format_output_type(output_type),
            **kwargs,
        ):
            yield gen_response.text

__init__(model, tokenizer)

Parameters:

Name Type Description Default
model Module

An instance of an mlx_lm model.

required
tokenizer PreTrainedTokenizer

An instance of an mlx_lm tokenizer or of a compatible transformers tokenizer.

required
Source code in outlines/models/mlxlm.py
def __init__(
    self,
    model: "nn.Module",
    tokenizer: "PreTrainedTokenizer",
):
    """
    Parameters
    ----------
    model
        An instance of an `mlx_lm` model.
    tokenizer
        An instance of an `mlx_lm` tokenizer or of a compatible
        `transformers` tokenizer.

    """
    self.model = model
    # mlx-lm's generate functions consume the tokenizer directly.
    self.mlx_tokenizer = tokenizer
    # The logits processor works with the wrapped HF tokenizer instead.
    self.tokenizer = TransformerTokenizer(tokenizer._tokenizer)
    self.type_adapter = MLXLMTypeAdapter(
        tokenizer=tokenizer,
        has_chat_template=_check_hf_chat_template(tokenizer),
    )

generate(model_input, output_type=None, **kwargs)

Generate text using mlx-lm.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[OutlinesLogitsProcessor]

The logits processor the model will use to constrain the format of the generated text.

None
kwargs

Additional keyword arguments to pass to the mlx-lm library.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/mlxlm.py
def generate(
    self,
    model_input: str,
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **kwargs,
) -> str:
    """Generate text using `mlx-lm`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    kwargs
        Additional keyword arguments to pass to the `mlx-lm` library.

    Returns
    -------
    str
        The text generated by the model.

    """
    from mlx_lm import generate

    prompt = self.type_adapter.format_input(model_input)
    processors = self.type_adapter.format_output_type(output_type)

    return generate(
        self.model,
        self.mlx_tokenizer,
        prompt,
        logits_processors=processors,
        **kwargs,
    )

generate_batch(model_input, output_type=None, **kwargs)

Generate a batch of text using mlx-lm.

Parameters:

Name Type Description Default
model_input list[str]

The list of prompts based on which the model will generate a response.

required
output_type Optional[OutlinesLogitsProcessor]

The logits processor the model will use to constrain the format of the generated text.

None
kwargs

Additional keyword arguments to pass to the mlx-lm library.

{}

Returns:

Type Description
list[str]

The list of text generated by the model.

Source code in outlines/models/mlxlm.py
def generate_batch(
    self,
    model_input: list[str],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **kwargs,
) -> list[str]:
    """Generate a batch of text using `mlx-lm`.

    Parameters
    ----------
    model_input
        The list of prompts based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    kwargs
        Additional keyword arguments to pass to the `mlx-lm` library.

    Returns
    -------
    list[str]
        The list of text generated by the model.

    Raises
    ------
    NotImplementedError
        If `output_type` is provided. mlx-lm cannot apply a logits
        processor in batch mode.

    """
    # Fail fast before importing mlx_lm so a missing dependency does not
    # mask the real error. The previous message concatenation was also
    # missing a space between sentences.
    if output_type:
        raise NotImplementedError(
            "mlx-lm does not support constrained generation with batching. "
            "You cannot provide an `output_type` with this method."
        )

    from mlx_lm import batch_generate

    prompts = [self.type_adapter.format_input(item) for item in model_input]

    # Contrarily to the other generate methods, batch_generate requires
    # tokenized prompts. Only add special tokens (e.g. BOS) when the
    # prompt does not already start with the BOS token.
    tokenized_model_input = [
        self.mlx_tokenizer.encode(
            prompt,
            add_special_tokens=(
                self.mlx_tokenizer.bos_token is None
                or not prompt.startswith(self.mlx_tokenizer.bos_token)
            ),
        )
        for prompt in prompts
    ]

    response = batch_generate(
        self.model,
        self.mlx_tokenizer,
        tokenized_model_input,
        **kwargs,
    )

    return response.texts

generate_stream(model_input, output_type=None, **kwargs)

Stream text using mlx-lm.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[OutlinesLogitsProcessor]

The logits processor the model will use to constrain the format of the generated text.

None
kwargs

Additional keyword arguments to pass to the mlx-lm library.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/mlxlm.py
def generate_stream(
    self,
    model_input: str,
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **kwargs,
) -> Iterator[str]:
    """Stream text using `mlx-lm`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    kwargs
        Additional keyword arguments to pass to the `mlx-lm` library.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    from mlx_lm import stream_generate

    prompt = self.type_adapter.format_input(model_input)
    processors = self.type_adapter.format_output_type(output_type)

    for chunk in stream_generate(
        self.model,
        self.mlx_tokenizer,
        prompt,
        logits_processors=processors,
        **kwargs,
    ):
        yield chunk.text

Mistral

Bases: Model

Thin wrapper around the mistralai.Mistral client.

Converts input and output types to arguments for the mistralai.Mistral client's chat.complete or chat.stream methods.

Source code in outlines/models/mistral.py
class Mistral(Model):
    """Thin wrapper around the `mistralai.Mistral` client.

    Converts input and output types to arguments for the `mistralai.Mistral`
    client's `chat.complete` or `chat.stream` methods.

    """

    def __init__(
        self, client: "MistralClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client : MistralClient
            A mistralai.Mistral client instance.
        model_name : Optional[str]
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = MistralTypeAdapter()

    @staticmethod
    def _translate_error(e):
        """Re-raise a client exception as a `TypeError` (schema-related) or a
        `RuntimeError`, always chaining the original exception as the cause.

        Previously the `TypeError` branch dropped the cause (`from e` was
        missing), losing the original traceback; both branches now chain it.
        This helper also removes the duplication between `generate` and
        `generate_stream`.
        """
        if "schema" in str(e).lower() or "json_schema" in str(e).lower():
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        raise RuntimeError(f"Mistral API error: {e}") from e

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate a response from the model.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs : Any
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The response generated by the model as text.

        Raises
        ------
        TypeError
            If the API rejects the provided schema.
        RuntimeError
            For any other API failure.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        # Fall back to the instance-level model name when the caller did not
        # specify one.
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = self.client.chat.complete(
                messages=messages,
                response_format=response_format,
                **inference_kwargs,
            )
        except Exception as e:
            self._translate_error(e)

        outputs = [choice.message for choice in result.choices]

        # A single choice is unwrapped for convenience; multiple choices are
        # returned as a list of their contents.
        if len(outputs) == 1:
            return outputs[0].content
        else:
            return [m.content for m in outputs]

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Batch inference is not available in the `mistralai` library."""
        raise NotImplementedError(
            "The `mistralai` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs,
    ) -> Iterator[str]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text chunks generated by the model.

        Raises
        ------
        TypeError
            If the API rejects the provided schema.
        RuntimeError
            For any other API failure.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            stream = self.client.chat.stream(
                messages=messages,
                response_format=response_format,
                **inference_kwargs
            )
        except Exception as e:
            self._translate_error(e)

        # Skip events without data/choices and deltas with no content.
        for chunk in stream:
            if (
                hasattr(chunk, "data")
                and chunk.data.choices
                and chunk.data.choices[0].delta.content is not None
            ):
                yield chunk.data.choices[0].delta.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Mistral

A mistralai.Mistral client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/mistral.py
def __init__(
    self, client: "MistralClient", model_name: Optional[str] = None
):
    """
    Parameters
    ----------
    client : MistralClient
        A mistralai.Mistral client instance.
    model_name : Optional[str]
        The name of the model to use when the caller does not pick one.

    """
    self.type_adapter = MistralTypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **inference_kwargs)

Generate a response from the model.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt or chat messages to generate a response from.

required
output_type Optional[Any]

The desired format of the response (e.g., JSON schema).

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The response generated by the model as text.

Source code in outlines/models/mistral.py
def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate a response from the model.

    Parameters
    ----------
    model_input : Union[Chat, list, str]
        The prompt or chat messages to generate a response from.
    output_type : Optional[Any]
        The desired format of the response (e.g., JSON schema).
    **inference_kwargs : Any
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The response generated by the model as text.

    Raises
    ------
    TypeError
        If the API rejects the provided schema.
    RuntimeError
        For any other API failure.

    """
    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    # Fall back to the instance-level model name when the caller did not
    # specify one.
    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        result = self.client.chat.complete(
            messages=messages,
            response_format=response_format,
            **inference_kwargs,
        )
    except Exception as e:
        if "schema" in str(e).lower() or "json_schema" in str(e).lower():
            # Chain the cause (`from e`) so the original traceback is kept,
            # matching the RuntimeError branch below.
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        else:
            raise RuntimeError(f"Mistral API error: {e}") from e

    outputs = [choice.message for choice in result.choices]

    # A single choice is unwrapped for convenience; multiple choices are
    # returned as a list of their contents.
    if len(outputs) == 1:
        return outputs[0].content
    else:
        return [m.content for m in outputs]

generate_stream(model_input, output_type=None, **inference_kwargs)

Generate a stream of responses from the model.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt or chat messages to generate a response from.

required
output_type Optional[Any]

The desired format of the response (e.g., JSON schema).

None
**inference_kwargs

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text chunks generated by the model.

Source code in outlines/models/mistral.py
def generate_stream(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs,
) -> Iterator[str]:
    """Generate a stream of responses from the model.

    Parameters
    ----------
    model_input : Union[Chat, list, str]
        The prompt or chat messages to generate a response from.
    output_type : Optional[Any]
        The desired format of the response (e.g., JSON schema).
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text chunks generated by the model.

    Raises
    ------
    TypeError
        If the API rejects the provided schema.
    RuntimeError
        For any other API failure.

    """
    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        stream = self.client.chat.stream(
            messages=messages,
            response_format=response_format,
            **inference_kwargs
        )
    except Exception as e:
        if "schema" in str(e).lower() or "json_schema" in str(e).lower():
            # Chain the cause (`from e`) so the original traceback is kept,
            # matching the RuntimeError branch below.
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        else:
            raise RuntimeError(f"Mistral API error: {e}") from e

    # Skip events without data/choices and deltas with no content.
    for chunk in stream:
        if (
            hasattr(chunk, "data")
            and chunk.data.choices
            and chunk.data.choices[0].delta.content is not None
        ):
            yield chunk.data.choices[0].delta.content

Model

Bases: ABC

Base class for all synchronous models.

This class defines shared __call__, batch and stream methods that can be used to call the model directly. The generate, generate_batch, and generate_stream methods must be implemented by the subclasses. All models inheriting from this class must define a type_adapter attribute of type ModelTypeAdapter. The methods of the type_adapter attribute are used in the generate, generate_batch, and generate_stream methods to format the input and output types received by the model. Additionally, steerable models must define a tensor_library_name attribute.

Source code in outlines/models/base.py
class Model(ABC):
    """Abstract parent class of all synchronous models.

    Concrete subclasses must implement `generate`, `generate_batch` and
    `generate_stream`, and must expose a `type_adapter` attribute (a
    `ModelTypeAdapter` instance) whose methods translate the input and
    output types received from the user into the format the underlying
    model expects. Steerable models must additionally declare a
    `tensor_library_name` attribute.
    The concrete `__call__`, `batch` and `stream` methods defined below
    let users invoke the model directly: each one builds a `Generator`
    and delegates the actual work to it.

    """
    # Translates user-level input/output types for the underlying model.
    type_adapter: ModelTypeAdapter
    # Name of the tensor library; only steerable models must define it.
    tensor_library_name: str

    def __call__(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> Any:
        """Call the model directly.

        Calling the model is shorthand for building a generator with the
        provided output type and invoking it, so these are equivalent:
        ```python
        Generator(model, Foo)("prompt")
        model("prompt", Foo)
        ```

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        backend
            Name of the backend used to create the logits processor for
            the response. Only relevant for steerable models when an
            `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        # NOTE(review): local import — presumably avoids a circular
        # dependency with outlines.generator; confirm before moving it.
        from outlines.generator import Generator

        generator = Generator(self, output_type, backend)
        return generator(model_input, **inference_kwargs)

    def batch(
        self,
        model_input: List[Any],
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> List[Any]:
        """Run the model on several inputs at once.

        Using `batch` on the model is shorthand for building a generator
        and invoking its `batch` method, so these are equivalent:
        ```python
        Generator(model, Foo).batch(["prompt1", "prompt2"])
        model.batch(["prompt1", "prompt2"], Foo)
        ```

        Parameters
        ----------
        model_input
            The list of inputs provided by the user.
        output_type
            The output type provided by the user.
        backend
            Name of the backend used to create the logits processor for
            the response. Only relevant for steerable models when an
            `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        List[Any]
            One response per input.

        """
        from outlines import Generator

        return Generator(self, output_type, backend).batch(
            model_input, **inference_kwargs
        )  # type: ignore

    def stream(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> Iterator[Any]:
        """Stream a response from the model.

        Using `stream` on the model is shorthand for building a generator
        and invoking its `stream` method, so these are equivalent:
        ```python
        for chunk in Generator(model, Foo)("prompt"):
            print(chunk)
        for chunk in model.stream("prompt", Foo):
            print(chunk)
        ```

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        backend
            Name of the backend used to create the logits processor for
            the response. Only relevant for steerable models when an
            `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Iterator[Any]
            A stream of responses from the model.

        """
        from outlines import Generator

        return Generator(self, output_type, backend).stream(
            model_input, **inference_kwargs
        )  # type: ignore

    @abstractmethod
    def generate(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> Any:
        """Produce a single response from the model.

        For steerable models `output_type` carries a logits processor;
        for black-box models it carries a type (Json, Enum...). This
        method is not meant to be called directly by end users.

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        ...

    @abstractmethod
    def generate_batch(
        self,
        model_input: List[Any],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> List[Any]:
        """Produce a batch of responses from the model.

        For steerable models `output_type` carries a logits processor;
        for black-box models it carries a type (Json, Enum...). This
        method is not meant to be called directly by end users.

        Parameters
        ----------
        model_input
            The list of inputs provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        ...

    @abstractmethod
    def generate_stream(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> Iterator[Any]:
        """Produce a stream of responses from the model.

        For steerable models `output_type` carries a logits processor;
        for black-box models it carries a type (Json, Enum...). This
        method is not meant to be called directly by end users.

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Iterator[Any]
            A stream of responses from the model.

        """
        ...

__call__(model_input, output_type=None, backend=None, **inference_kwargs)

Call the model.

Users can call the model directly, in which case we will create a generator instance with the output type provided and call it. Thus, those commands are equivalent:

generator = Generator(model, Foo)
generator("prompt")
and
model("prompt", Foo)

Parameters:

Name Type Description Default
model_input Any

The input provided by the user.

required
output_type Optional[Any]

The output type provided by the user.

None
backend Optional[str]

The name of the backend to use to create the logits processor that will be used to generate the response. Only used for steerable models if output_type is provided.

None
**inference_kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/models/base.py
def __call__(
    self,
    model_input: Any,
    output_type: Optional[Any] = None,
    backend: Optional[str] = None,
    **inference_kwargs: Any
) -> Any:
    """Invoke the model directly.

    Calling the model is shorthand for building a generator with the
    provided output type and invoking it, so these are equivalent:
    ```python
    Generator(model, Foo)("prompt")
    model("prompt", Foo)
    ```

    Parameters
    ----------
    model_input
        The input provided by the user.
    output_type
        The output type provided by the user.
    backend
        Name of the backend used to create the logits processor for the
        response. Only relevant for steerable models when an
        `output_type` is provided.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Any
        The response generated by the model.

    """
    from outlines.generator import Generator

    generator = Generator(self, output_type, backend)
    return generator(model_input, **inference_kwargs)

batch(model_input, output_type=None, backend=None, **inference_kwargs)

Make a batch call to the model (several inputs at once).

Users can use the batch method from the model directly, in which case we will create a generator instance with the output type provided and then invoke its batch method. Thus, those commands are equivalent:

generator = Generator(model, Foo)
generator.batch(["prompt1", "prompt2"])
and
model.batch(["prompt1", "prompt2"], Foo)

Parameters:

Name Type Description Default
model_input List[Any]

The list of inputs provided by the user.

required
output_type Optional[Any]

The output type provided by the user.

None
backend Optional[str]

The name of the backend to use to create the logits processor that will be used to generate the response. Only used for steerable models if output_type is provided.

None
**inference_kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
List[Any]

The list of responses generated by the model.

Source code in outlines/models/base.py
def batch(
    self,
    model_input: List[Any],
    output_type: Optional[Any] = None,
    backend: Optional[str] = None,
    **inference_kwargs: Any
) -> List[Any]:
    """Run the model on several inputs at once.

    Using `batch` on the model is shorthand for building a generator
    and invoking its `batch` method, so these are equivalent:
    ```python
    Generator(model, Foo).batch(["prompt1", "prompt2"])
    model.batch(["prompt1", "prompt2"], Foo)
    ```

    Parameters
    ----------
    model_input
        The list of inputs provided by the user.
    output_type
        The output type provided by the user.
    backend
        Name of the backend used to create the logits processor for the
        response. Only relevant for steerable models when an
        `output_type` is provided.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    List[Any]
        One response per input.

    """
    from outlines import Generator

    return Generator(self, output_type, backend).batch(
        model_input, **inference_kwargs
    )  # type: ignore

generate(model_input, output_type=None, **inference_kwargs) abstractmethod

Generate a response from the model.

The output_type argument contains a logits processor for steerable models while it contains a type (Json, Enum...) for black-box models. This method is not intended to be used directly by end users.

Parameters:

Name Type Description Default
model_input Any

The input provided by the user.

required
output_type Optional[Any]

The output type provided by the user.

None
**inference_kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/models/base.py
@abstractmethod
def generate(
    self,
    model_input: Any,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any
) -> Any:
    """Produce a single response from the model.

    For steerable models `output_type` carries a logits processor; for
    black-box models it carries a type (Json, Enum...). This method is
    not meant to be called directly by end users.

    Parameters
    ----------
    model_input
        The input provided by the user.
    output_type
        The output type provided by the user.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Any
        The response generated by the model.

    """
    ...

generate_batch(model_input, output_type=None, **inference_kwargs) abstractmethod

Generate a batch of responses from the model.

The output_type argument contains a logits processor for steerable models while it contains a type (Json, Enum...) for black-box models. This method is not intended to be used directly by end users.

Parameters:

Name Type Description Default
model_input List[Any]

The list of inputs provided by the user.

required
output_type Optional[Any]

The output type provided by the user.

None
**inference_kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
List[Any]

The list of responses generated by the model.

Source code in outlines/models/base.py
@abstractmethod
def generate_batch(
    self,
    model_input: List[Any],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any
) -> List[Any]:
    """Produce a batch of responses from the model.

    For steerable models `output_type` carries a logits processor; for
    black-box models it carries a type (Json, Enum...). This method is
    not meant to be called directly by end users.

    Parameters
    ----------
    model_input
        The list of inputs provided by the user.
    output_type
        The output type provided by the user.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    List[Any]
        The list of responses generated by the model.

    """
    ...

generate_stream(model_input, output_type=None, **inference_kwargs) abstractmethod

Generate a stream of responses from the model.

The output_type argument contains a logits processor for steerable models while it contains a type (Json, Enum...) for black-box models. This method is not intended to be used directly by end users.

Parameters:

Name Type Description Default
model_input Any

The input provided by the user.

required
output_type Optional[Any]

The output type provided by the user.

None
**inference_kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Iterator[Any]

A stream of responses from the model.

Source code in outlines/models/base.py
@abstractmethod
def generate_stream(
    self,
    model_input: Any,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any
) -> Iterator[Any]:
    """Produce a stream of responses from the model.

    For steerable models `output_type` carries a logits processor; for
    black-box models it carries a type (Json, Enum...). This method is
    not meant to be called directly by end users.

    Parameters
    ----------
    model_input
        The input provided by the user.
    output_type
        The output type provided by the user.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Iterator[Any]
        A stream of responses from the model.

    """
    ...

stream(model_input, output_type=None, backend=None, **inference_kwargs)

Stream a response from the model.

Users can use the stream method from the model directly, in which case we will create a generator instance with the output type provided and then invoke its stream method. Thus, those commands are equivalent:

generator = Generator(model, Foo)
for chunk in generator("prompt"):
    print(chunk)
and
for chunk in model.stream("prompt", Foo):
    print(chunk)

Parameters:

Name Type Description Default
model_input Any

The input provided by the user.

required
output_type Optional[Any]

The output type provided by the user.

None
backend Optional[str]

The name of the backend to use to create the logits processor that will be used to generate the response. Only used for steerable models if output_type is provided.

None
**inference_kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Iterator[Any]

A stream of responses from the model.

Source code in outlines/models/base.py
def stream(
    self,
    model_input: Any,
    output_type: Optional[Any] = None,
    backend: Optional[str] = None,
    **inference_kwargs: Any
) -> Iterator[Any]:
    """Stream a response from the model.

    Using `stream` on the model is shorthand for building a generator
    and invoking its `stream` method, so these are equivalent:
    ```python
    for chunk in Generator(model, Foo)("prompt"):
        print(chunk)
    for chunk in model.stream("prompt", Foo):
        print(chunk)
    ```

    Parameters
    ----------
    model_input
        The input provided by the user.
    output_type
        The output type provided by the user.
    backend
        Name of the backend used to create the logits processor for the
        response. Only relevant for steerable models when an
        `output_type` is provided.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Iterator[Any]
        A stream of responses from the model.

    """
    from outlines import Generator

    return Generator(self, output_type, backend).stream(
        model_input, **inference_kwargs
    )  # type: ignore

ModelTypeAdapter

Bases: ABC

Base class for all model type adapters.

A type adapter instance must be given as a value to the type_adapter attribute when instantiating a model. The type adapter is responsible for formatting the input and output types passed to the model to match the specific format expected by the associated model.

Source code in outlines/models/base.py
class ModelTypeAdapter(ABC):
    """Abstract parent class of all model type adapters.

    Every model carries a type adapter instance as its `type_adapter`
    attribute. The adapter is responsible for reshaping the input and
    the output type supplied by the user into the exact format the
    associated model expects.

    """

    @abstractmethod
    def format_input(self, model_input: Any) -> Any:
        """Turn the user input into the model's expected input format.

        For API-based models this usually means building the `messages`
        argument handed to the client; for local models it can be as
        simple as casting a str to a list. This method also validates
        that the input type provided by the user is supported by the
        model.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        Any
            The formatted input to pass to the model.

        """
        ...

    @abstractmethod
    def format_output_type(self, output_type: Optional[Any] = None) -> Any:
        """Turn the output type into the model's expected format.

        For black-box models this usually means building a
        `response_format` argument; for steerable models it means
        formatting the logits processor into the object type the model
        expects.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        Any
            The formatted output type to pass to the model.

        """
        ...

format_input(model_input) abstractmethod

Format the user input to the expected format of the model.

For API-based models, it typically means creating the messages argument passed to the client. For local models, it can mean casting the input from str to list for instance. This method is also used to validate that the input type provided by the user is supported by the model.

Parameters:

Name Type Description Default
model_input Any

The input provided by the user.

required

Returns:

Type Description
Any

The formatted input to be passed to the model.

Source code in outlines/models/base.py
@abstractmethod
def format_input(self, model_input: Any) -> Any:
    """Turn the user input into the model's expected input format.

    For API-based models this usually means building the `messages`
    argument handed to the client; for local models it can be as simple
    as casting a str to a list. This method also validates that the
    input type provided by the user is supported by the model.

    Parameters
    ----------
    model_input
        The input provided by the user.

    Returns
    -------
    Any
        The formatted input to pass to the model.

    """
    ...

format_output_type(output_type=None) abstractmethod

Format the output type to the expected format of the model.

For black-box models, this typically means creating a response_format argument. For steerable models, it means formatting the logits processor to create the object type expected by the model.

Parameters:

Name Type Description Default
output_type Optional[Any]

The output type provided by the user.

None

Returns:

Type Description
Any

The formatted output type to be passed to the model.

Source code in outlines/models/base.py
@abstractmethod
def format_output_type(self, output_type: Optional[Any] = None) -> Any:
    """Turn the output type into the model's expected format.

    For black-box models this usually means building a `response_format`
    argument; for steerable models it means formatting the logits
    processor into the object type the model expects.

    Parameters
    ----------
    output_type
        The output type provided by the user.

    Returns
    -------
    Any
        The formatted output type to pass to the model.

    """
    ...

Ollama

Bases: Model

Thin wrapper around the ollama.Client client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the ollama.Client client.

Source code in outlines/models/ollama.py
class Ollama(Model):
    """Thin wrapper around the `ollama.Client` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `ollama.Client` client.

    """

    def __init__(self, client: "Client", model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            The `ollama.Client` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OllamaTypeAdapter()

    def generate(self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using Ollama.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        # Fall back to the model name set at construction time unless the
        # caller explicitly provided one.
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        # Fixed: removed a leftover debug `print(...)` that also formatted
        # the input a second time on every call.
        response = self.client.chat(
            messages=self.type_adapter.format_input(model_input),
            format=self.type_adapter.format_output_type(output_type),
            **kwargs,
        )
        return response.message.content

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **kwargs,
    ):
        """Unsupported operation.

        Raises
        ------
        NotImplementedError
            Always, since the `ollama` client has no batch endpoint.

        """
        raise NotImplementedError(
            "The `ollama` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using Ollama.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        # Fall back to the model name set at construction time unless the
        # caller explicitly provided one.
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        response = self.client.chat(
            messages=self.type_adapter.format_input(model_input),
            format=self.type_adapter.format_output_type(output_type),
            stream=True,
            **kwargs,
        )
        for chunk in response:
            yield chunk.message.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Client

The ollama.Client client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/ollama.py
def __init__(self, client: "Client", model_name: Optional[str] = None):
    """Build the wrapper.

    Parameters
    ----------
    client
        The `ollama.Client` client.
    model_name
        The name of the model to use.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = OllamaTypeAdapter()

generate(model_input, output_type=None, **kwargs)

Generate text using Ollama.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/ollama.py
def generate(self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> str:
    """Generate text using Ollama.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The text generated by the model.

    """
    # Fall back to the model name set at construction time unless the
    # caller explicitly provided one.
    if "model" not in kwargs and self.model_name is not None:
        kwargs["model"] = self.model_name

    # Fixed: removed a leftover debug `print(...)` that also formatted
    # the input a second time on every call.
    response = self.client.chat(
        messages=self.type_adapter.format_input(model_input),
        format=self.type_adapter.format_output_type(output_type),
        **kwargs,
    )
    return response.message.content

generate_stream(model_input, output_type=None, **kwargs)

Stream text using Ollama.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/ollama.py
def generate_stream(
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> Iterator[str]:
    """Stream text using Ollama.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    # Default to the model name chosen at construction time.
    if "model" not in kwargs and self.model_name is not None:
        kwargs["model"] = self.model_name

    stream = self.client.chat(
        messages=self.type_adapter.format_input(model_input),
        format=self.type_adapter.format_output_type(output_type),
        stream=True,
        **kwargs,
    )
    for part in stream:
        yield part.message.content

OpenAI

Bases: Model

Thin wrapper around the openai.OpenAI client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client.

Source code in outlines/models/openai.py
class OpenAI(Model):
    """Thin wrapper around the `openai.OpenAI` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client.

    """

    def __init__(
        self,
        client: Union["OpenAIClient", "AzureOpenAIClient"],
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `openai.OpenAI` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OpenAITypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        Raises
        ------
        TypeError
            If OpenAI rejects the provided schema.
        ValueError
            If the model refuses to answer the request.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = self.client.chat.completions.create(
                messages=messages,
                **response_format,
                **inference_kwargs,
            )
        except openai.BadRequestError as e:
            # `e.body` is not guaranteed to be a dict containing a "message"
            # key; access it defensively so a malformed error body does not
            # mask the original exception with a TypeError/KeyError.
            body = getattr(e, "body", None)
            error_message = (
                body.get("message", "") if isinstance(body, dict) else ""
            )
            if error_message.startswith("Invalid schema"):
                raise TypeError(
                    f"OpenAI does not support your schema: {error_message}. "
                    "Try a local model or dottxt instead."
                )
            raise

        messages = [choice.message for choice in result.choices]
        for message in messages:
            if message.refusal is not None:
                raise ValueError(
                    f"OpenAI refused to answer the request: {message.refusal}"
                )

        # A single choice is returned as a plain string; multiple choices
        # (e.g. `n > 1`) are returned as a list.
        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported: the `openai` library has no batch inference API."""
        raise NotImplementedError(
            "The `openai` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs,
    ) -> Iterator[str]:
        """Stream text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            stream = self.client.chat.completions.create(
                stream=True,
                messages=messages,
                **response_format,
                **inference_kwargs
            )
        except openai.BadRequestError as e:
            # Same defensive access as in `generate`: the error body may be
            # None or a non-dict, in which case we simply re-raise.
            body = getattr(e, "body", None)
            error_message = (
                body.get("message", "") if isinstance(body, dict) else ""
            )
            if error_message.startswith("Invalid schema"):
                raise TypeError(
                    f"OpenAI does not support your schema: {error_message}. "
                    "Try a local model or dottxt instead."
                )
            raise

        for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Union[OpenAI, AzureOpenAI]

The openai.OpenAI client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/openai.py
def __init__(
    self,
    client: Union["OpenAIClient", "AzureOpenAIClient"],
    model_name: Optional[str] = None,
):
    """Store the client and model name, and build the type adapter used to
    translate Outlines inputs and output types into client arguments.

    Parameters
    ----------
    client
        The `openai.OpenAI` client.
    model_name
        The name of the model to use.

    """
    self.client, self.model_name = client, model_name
    self.type_adapter = OpenAITypeAdapter()

generate(model_input, output_type=None, **inference_kwargs)

Generate text using OpenAI.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Union[type[BaseModel], str]]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema or an empty dictionary.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/openai.py
def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Union[type[BaseModel], str]] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using OpenAI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema or an empty dictionary.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    Raises
    ------
    TypeError
        If OpenAI rejects the provided schema.
    ValueError
        If the model refuses to answer the request.

    """
    import openai

    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        result = self.client.chat.completions.create(
            messages=messages,
            **response_format,
            **inference_kwargs,
        )
    except openai.BadRequestError as e:
        # `e.body` is not guaranteed to be a dict containing a "message"
        # key; access it defensively so a malformed error body does not
        # mask the original exception with a TypeError/KeyError.
        body = getattr(e, "body", None)
        error_message = (
            body.get("message", "") if isinstance(body, dict) else ""
        )
        if error_message.startswith("Invalid schema"):
            raise TypeError(
                f"OpenAI does not support your schema: {error_message}. "
                "Try a local model or dottxt instead."
            )
        raise

    messages = [choice.message for choice in result.choices]
    for message in messages:
        if message.refusal is not None:
            raise ValueError(
                f"OpenAI refused to answer the request: {message.refusal}"
            )

    # A single choice is returned as a plain string; multiple choices
    # (e.g. `n > 1`) are returned as a list.
    if len(messages) == 1:
        return messages[0].content
    else:
        return [message.content for message in messages]

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using OpenAI.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Union[type[BaseModel], str]]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema or an empty dictionary.

None
**inference_kwargs

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/openai.py
def generate_stream(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Union[type[BaseModel], str]] = None,
    **inference_kwargs,
) -> Iterator[str]:
    """Stream text using OpenAI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema or an empty dictionary.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    Raises
    ------
    TypeError
        If OpenAI rejects the provided schema.

    """
    import openai

    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        stream = self.client.chat.completions.create(
            stream=True,
            messages=messages,
            **response_format,
            **inference_kwargs
        )
    except openai.BadRequestError as e:
        # `e.body` is not guaranteed to be a dict containing a "message"
        # key; access it defensively so a malformed error body does not
        # mask the original exception with a TypeError/KeyError.
        body = getattr(e, "body", None)
        error_message = (
            body.get("message", "") if isinstance(body, dict) else ""
        )
        if error_message.startswith("Invalid schema"):
            raise TypeError(
                f"OpenAI does not support your schema: {error_message}. "
                "Try a local model or dottxt instead."
            )
        raise

    for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content

SGLang

Bases: Model

Thin wrapper around the openai.OpenAI client used to communicate with an SGLang server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the SGLang server.

Source code in outlines/models/sglang.py
class SGLang(Model):
    """Thin wrapper around the `openai.OpenAI` client used to communicate with
    an SGLang server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """Store the client and model name, and build the type adapter used
        to translate Outlines inputs and output types into client arguments.

        Parameters
        ----------
        client
            An `openai.OpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using SGLang.

        Parameters
        ----------
        model_input
            The prompt from which the model generates its response.
        output_type
            The desired response format. Every Outlines output type is
            supported, provided the server uses a structured generation
            backend that supports it.
        inference_kwargs
            Extra keyword arguments forwarded to the client.

        Returns
        -------
        Union[str, list[str]]
            The generated text.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )
        completion = self.client.chat.completions.create(**request_args)

        choice_messages = [choice.message for choice in completion.choices]
        for choice_message in choice_messages:
            if choice_message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The SGLang server refused to answer the request: "
                    f"{choice_message.refusal}"
                )

        # One choice comes back as a plain string, several as a list.
        if len(choice_messages) == 1:
            return choice_messages[0].content
        return [choice_message.content for choice_message in choice_messages]

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported for SGLang."""
        raise NotImplementedError(
            "SGLang does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using SGLang.

        Parameters
        ----------
        model_input
            The prompt from which the model generates its response.
        output_type
            The desired response format. Every Outlines output type is
            supported, provided the server uses a structured generation
            backend that supports it.
        inference_kwargs
            Extra keyword arguments forwarded to the client.

        Returns
        -------
        Iterator[str]
            An iterator over the generated text chunks.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )
        chunk_stream = self.client.chat.completions.create(
            stream=True, **request_args,
        )

        for chunk in chunk_stream:  # pragma: no cover
            delta_text = (
                chunk.choices[0].delta.content if chunk.choices else None
            )
            if delta_text is not None:
                yield delta_text

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Assemble the keyword arguments for the SGLang client call."""
        messages = self.type_adapter.format_input(model_input)
        inference_kwargs.update(
            self.type_adapter.format_output_type(output_type)
        )

        # An explicit `model` kwarg from the caller takes precedence over
        # the wrapper's configured model name.
        if self.model_name is not None:
            inference_kwargs.setdefault("model", self.model_name)

        return {"messages": messages, **inference_kwargs}

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client

An openai.OpenAI client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/sglang.py
def __init__(self, client, model_name: Optional[str] = None):
    """Store the client and model name, and build the type adapter used to
    translate Outlines inputs and output types into client arguments.

    Parameters
    ----------
    client
        An `openai.OpenAI` client instance.
    model_name
        The name of the model to use.

    """
    self.client, self.model_name = client, model_name
    self.type_adapter = SGLangTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs)

Generate text using SGLang.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/sglang.py
def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using SGLang.

    Parameters
    ----------
    model_input
        The prompt from which the model generates its response.
    output_type
        The desired response format. Every Outlines output type is
        supported, provided the server uses a structured generation
        backend that supports it.
    inference_kwargs
        Extra keyword arguments forwarded to the client.

    Returns
    -------
    Union[str, list[str]]
        The generated text.

    """
    request_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )
    completion = self.client.chat.completions.create(**request_args)

    choice_messages = [choice.message for choice in completion.choices]
    for choice_message in choice_messages:
        if choice_message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The SGLang server refused to answer the request: "
                f"{choice_message.refusal}"
            )

    # One choice comes back as a plain string, several as a list.
    if len(choice_messages) == 1:
        return choice_messages[0].content
    return [choice_message.content for choice_message in choice_messages]

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using SGLang.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/sglang.py
def generate_stream(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using SGLang.

    Parameters
    ----------
    model_input
        The prompt from which the model generates its response.
    output_type
        The desired response format. Every Outlines output type is
        supported, provided the server uses a structured generation
        backend that supports it.
    inference_kwargs
        Extra keyword arguments forwarded to the client.

    Returns
    -------
    Iterator[str]
        An iterator over the generated text chunks.

    """
    request_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )
    chunk_stream = self.client.chat.completions.create(
        stream=True, **request_args,
    )

    for chunk in chunk_stream:  # pragma: no cover
        delta_text = chunk.choices[0].delta.content if chunk.choices else None
        if delta_text is not None:
            yield delta_text

TGI

Bases: Model

Thin wrapper around a huggingface_hub.InferenceClient client used to communicate with a TGI server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the huggingface_hub.InferenceClient client.

Source code in outlines/models/tgi.py
class TGI(Model):
    """Thin wrapper around a `huggingface_hub.InferenceClient` client used to
    communicate with a `TGI` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the
    `huggingface_hub.InferenceClient` client.

    """

    def __init__(self, client):
        """Store the client and build the type adapter used to translate
        Outlines inputs and output types into client arguments.

        Parameters
        ----------
        client
            A huggingface `InferenceClient` client instance.

        """
        self.client = client
        self.type_adapter = TGITypeAdapter()

    def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using TGI.

        Parameters
        ----------
        model_input
            The prompt from which the model generates its response.
        output_type
            The desired response format. Every output type except `CFG` is
            supported, provided the server uses a backend that supports it.
        inference_kwargs
            Extra keyword arguments forwarded to the client.

        Returns
        -------
        str
            The generated text.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )
        return self.client.text_generation(**request_args)

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported for TGI."""
        raise NotImplementedError("TGI does not support batch inference.")

    def generate_stream(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using TGI.

        Parameters
        ----------
        model_input
            The prompt from which the model generates its response.
        output_type
            The desired response format. Every output type except `CFG` is
            supported, provided the server uses a backend that supports it.
        inference_kwargs
            Extra keyword arguments forwarded to the client.

        Returns
        -------
        Iterator[str]
            An iterator over the generated text chunks.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )
        token_stream = self.client.text_generation(
            stream=True, **request_args,
        )
        yield from token_stream  # pragma: no cover

    def _build_client_args(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Assemble the keyword arguments for the TGI client call."""
        prompt = self.type_adapter.format_input(model_input)
        inference_kwargs.update(
            self.type_adapter.format_output_type(output_type)
        )
        return {"prompt": prompt, **inference_kwargs}

__init__(client)

Parameters:

Name Type Description Default
client

A huggingface InferenceClient client instance.

required
Source code in outlines/models/tgi.py
def __init__(self, client):
    """Store the client and build the type adapter used to translate
    Outlines inputs and output types into client arguments.

    Parameters
    ----------
    client
        A huggingface `InferenceClient` client instance.

    """
    self.client = client
    self.type_adapter = TGITypeAdapter()

generate(model_input, output_type=None, **inference_kwargs)

Generate text using TGI.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types except CFG are supported provided your server uses a backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/tgi.py
def generate(
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using TGI.

    Parameters
    ----------
    model_input
        The prompt from which the model generates its response.
    output_type
        The desired response format. Every output type except `CFG` is
        supported, provided the server uses a backend that supports it.
    inference_kwargs
        Extra keyword arguments forwarded to the client.

    Returns
    -------
    str
        The generated text.

    """
    request_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )
    return self.client.text_generation(**request_args)

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using TGI.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types except CFG are supported provided your server uses a backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/tgi.py
def generate_stream(
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using TGI.

    Parameters
    ----------
    model_input
        The prompt from which the model generates its response.
    output_type
        The desired response format. Every output type except `CFG` is
        supported, provided the server uses a backend that supports it.
    inference_kwargs
        Extra keyword arguments forwarded to the client.

    Returns
    -------
    Iterator[str]
        An iterator over the generated text chunks.

    """
    request_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )
    token_stream = self.client.text_generation(
        stream=True, **request_args,
    )
    yield from token_stream  # pragma: no cover

TransformerTokenizer

Bases: Tokenizer

Represents a tokenizer for models in the transformers library.

Source code in outlines/models/transformers.py
class TransformerTokenizer(Tokenizer):
    """Represents a tokenizer for models in the `transformers` library."""

    def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token
        self.get_vocab = self.tokenizer.get_vocab

        if self.tokenizer.pad_token_id is None:
            # Fall back to the EOS token for padding when the tokenizer does
            # not define a pad token.
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
            # Bug fix: `pad_token` was previously only assigned on the other
            # branch, leaving the attribute undefined here and causing a
            # latent AttributeError for tokenizers without a pad token.
            self.pad_token = self.eos_token
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple["torch.LongTensor", "torch.LongTensor"]:
        """Tokenize `prompt`, returning padded input ids and attention mask."""
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: "torch.LongTensor") -> List[str]:
        """Decode token ids back to text, dropping special tokens."""
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        # Restore the leading space that SentencePiece-style tokenizers encode
        # with the SPIECE_UNDERLINE prefix (or the <0x20> byte token).
        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
            return " " + string

        return string

    def __eq__(self, other):
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __hash__(self):
        from datasets.fingerprint import Hasher

        return hash(Hasher.hash(self.tokenizer))

    def __getstate__(self):
        # Only the wrapped tokenizer is needed to rebuild the object; every
        # other attribute is re-derived in `__setstate__` via `__init__`.
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        self.__init__(state["tokenizer"])

Transformers

Bases: Model

Thin wrapper around a transformers model and a transformers tokenizer.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the transformers model and tokenizer.

Source code in outlines/models/transformers.py
class Transformers(Model):
    """Thin wrapper around a `transformers` model and a `transformers`
    tokenizer.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `transformers` model and
    tokenizer.

    """

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
        *,
        device_dtype: Optional["torch.dtype"] = None,
    ):
        """
        Parameters
        ----------
        model
            A `PreTrainedModel`, or any model that is compatible with the
            `transformers` API for models.
        tokenizer
            A `PreTrainedTokenizer`, or any tokenizer that is compatible with
            the `transformers` API for tokenizers.
        device_dtype
            The dtype to use for the model. If not provided, the model will use
            the default dtype.

        """
        # We need to handle the cases in which jax/flax or tensorflow
        # is not available in the environment.
        try:
            from transformers import FlaxPreTrainedModel
        except ImportError:  # pragma: no cover
            FlaxPreTrainedModel = None

        try:
            from transformers import TFPreTrainedModel
        except ImportError:  # pragma: no cover
            TFPreTrainedModel = None

        # Left padding so generated tokens directly follow the prompt in every
        # row of a padded batch — presumably required for decoder-only models.
        tokenizer.padding_side = "left"
        self.model = model
        self.hf_tokenizer = tokenizer
        self.tokenizer = TransformerTokenizer(tokenizer)
        self.device_dtype = device_dtype
        self.type_adapter = TransformersTypeAdapter(
            tokenizer=tokenizer,
            has_chat_template=_check_hf_chat_template(tokenizer)
        )

        if (
            FlaxPreTrainedModel is not None
            and isinstance(model, FlaxPreTrainedModel)
        ):  # pragma: no cover
            self.tensor_library_name = "jax"
            warnings.warn("""
                Support for `jax` has been deprecated and will be removed in
                version 1.4.0 of Outlines. Please use `torch` instead.
                Transformers models using `jax` do not support structured
                generation.
                """,
                DeprecationWarning,
                stacklevel=2,
            )
        elif (
            TFPreTrainedModel is not None
            and isinstance(model, TFPreTrainedModel)
        ):  # pragma: no cover
            self.tensor_library_name = "tensorflow"
            warnings.warn("""
                Support for `tensorflow` has been deprecated and will be removed in
                version 1.4.0 of Outlines. Please use `torch` instead.
                Transformers models using `tensorflow` do not support structured
                generation.
                """,
                DeprecationWarning,
                stacklevel=2,
            )
        else:
            self.tensor_library_name = "torch"

    def _prepare_model_inputs(
        self,
        model_input,
        is_batch: bool = False,
    ) -> Tuple[Union[str, List[str]], dict]:
        """Turn the user input into arguments to pass to the model"""
        # Format validation
        if is_batch:
            prompts = [
                self.type_adapter.format_input(item)
                for item in model_input
            ]
        else:
            prompts = self.type_adapter.format_input(model_input)
        input_ids, attention_mask = self.tokenizer.encode(prompts)
        # NOTE(review): only the attention mask — not the input ids — is cast
        # to `device_dtype`; presumably for models expecting a float mask.
        # Confirm this asymmetry is intentional.
        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": (
                attention_mask.to(self.model.device, dtype=self.device_dtype)
                if self.device_dtype is not None
                else attention_mask.to(self.model.device)
            ),
        }

        return prompts, inputs

    def generate(
        self,
        model_input: Union[str, dict, Chat],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> Union[str, List[str]]:
        """Generate text using `transformers`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response. For
            multi-modal models, the input should be a dictionary containing the
            `text` key with a value of type `Union[str, List[str]]` and the
            other keys required by the model.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            of the `transformers` model.

        Returns
        -------
        Union[str, List[str]]
            The text generated by the model.

        """
        prompts, inputs = self._prepare_model_inputs(model_input, False)
        logits_processor = self.type_adapter.format_output_type(output_type)

        generated_ids = self._generate_output_seq(
            prompts,
            inputs,
            logits_processor=logits_processor,
            **inference_kwargs,
        )

        # required for multi-modal models that return a 2D tensor even when
        # num_return_sequences is 1
        num_samples = inference_kwargs.get("num_return_sequences", 1)
        if num_samples == 1 and len(generated_ids.shape) == 2:
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def generate_batch(
        self,
        model_input: List[Union[str, dict, Chat]],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> List[Union[str, List[str]]]:
        """Generate text for a batch of prompts using `transformers`.

        Parameters
        ----------
        model_input
            A list of prompts based on which the model will generate a
            response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            of the `transformers` model.

        Returns
        -------
        List[Union[str, List[str]]]
            The texts generated by the model, one entry per prompt (a list of
            strings per prompt when `num_return_sequences` > 1).

        """
        prompts, inputs = self._prepare_model_inputs(model_input, True) # type: ignore
        logits_processor = self.type_adapter.format_output_type(output_type)

        generated_ids = self._generate_output_seq(
            prompts, inputs, logits_processor=logits_processor, **inference_kwargs
        )

        # if there are multiple samples per input, convert generated_id to 3D
        num_samples = inference_kwargs.get("num_return_sequences", 1)
        if num_samples > 1:
            generated_ids = generated_ids.view(len(model_input), num_samples, -1)

        return self._decode_generation(generated_ids)

    def generate_stream(self, model_input, output_type, **inference_kwargs):
        """Not available for `transformers` models.

        TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

        """
        raise NotImplementedError(
            "Streaming is not implemented for Transformers models."
        )

    def _generate_output_seq(self, prompts, inputs, **inference_kwargs):
        """Call the model's `generate` method and keep only generated tokens."""
        input_ids = inputs["input_ids"]

        output_ids = self.model.generate(
            **inputs,
            **inference_kwargs,
        )

        # encoder-decoder returns output_ids only, decoder-only returns full seq ids
        if self.model.config.is_encoder_decoder:
            generated_ids = output_ids
        else:
            generated_ids = output_ids[:, input_ids.shape[1] :]

        return generated_ids

    def _decode_generation(self, generated_ids: "torch.Tensor"):
        """Decode generated ids of rank 1, 2 or 3 into (nested) strings."""
        if len(generated_ids.shape) == 1:
            return self.tokenizer.decode([generated_ids])[0]
        elif len(generated_ids.shape) == 2:
            return self.tokenizer.decode(generated_ids)
        elif len(generated_ids.shape) == 3:
            return [
                self.tokenizer.decode(generated_ids[i])
                for i in range(len(generated_ids))
            ]
        else:  # pragma: no cover
            raise TypeError(
                "Generated outputs aren't 1D, 2D or 3D, but instead are "
                f"{generated_ids.shape}"
            )

__init__(model, tokenizer, *, device_dtype=None)

Parameters:

model: A PreTrainedModel, or any model that is compatible with the transformers API for models.
tokenizer: A PreTrainedTokenizer, or any tokenizer that is compatible with the transformers API for tokenizers.
device_dtype: The dtype to use for the model. If not provided, the model will use the default dtype.

Source code in outlines/models/transformers.py
def __init__(
    self,
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizer",
    *,
    device_dtype: Optional["torch.dtype"] = None,
):
    """
    Parameters
    ----------
    model
        A `PreTrainedModel`, or any model that is compatible with the
        `transformers` API for models.
    tokenizer
        A `PreTrainedTokenizer`, or any tokenizer that is compatible with
        the `transformers` API for tokenizers.
    device_dtype
        The dtype to use for the model. If not provided, the model will use
        the default dtype.

    """
    # We need to handle the cases in which jax/flax or tensorflow
    # is not available in the environment.
    try:
        from transformers import FlaxPreTrainedModel
    except ImportError:  # pragma: no cover
        FlaxPreTrainedModel = None

    try:
        from transformers import TFPreTrainedModel
    except ImportError:  # pragma: no cover
        TFPreTrainedModel = None

    # Left padding so generated tokens directly follow the prompt in every
    # row of a padded batch — presumably required for decoder-only models.
    tokenizer.padding_side = "left"
    self.model = model
    self.hf_tokenizer = tokenizer
    self.tokenizer = TransformerTokenizer(tokenizer)
    self.device_dtype = device_dtype
    self.type_adapter = TransformersTypeAdapter(
        tokenizer=tokenizer,
        has_chat_template=_check_hf_chat_template(tokenizer)
    )

    if (
        FlaxPreTrainedModel is not None
        and isinstance(model, FlaxPreTrainedModel)
    ):  # pragma: no cover
        self.tensor_library_name = "jax"
        warnings.warn("""
            Support for `jax` has been deprecated and will be removed in
            version 1.4.0 of Outlines. Please use `torch` instead.
            Transformers models using `jax` do not support structured
            generation.
            """,
            DeprecationWarning,
            stacklevel=2,
        )
    elif (
        TFPreTrainedModel is not None
        and isinstance(model, TFPreTrainedModel)
    ):  # pragma: no cover
        self.tensor_library_name = "tensorflow"
        warnings.warn("""
            Support for `tensorflow` has been deprecated and will be removed in
            version 1.4.0 of Outlines. Please use `torch` instead.
            Transformers models using `tensorflow` do not support structured
            generation.
            """,
            DeprecationWarning,
            stacklevel=2,
        )
    else:
        self.tensor_library_name = "torch"

generate(model_input, output_type=None, **inference_kwargs)

Generate text using transformers.

Parameters:

Name Type Description Default
model_input Union[str, dict, Chat]

The prompt based on which the model will generate a response. For multi-modal models, the input should be a dictionary containing the text key with a value of type Union[str, List[str]] and the other keys required by the model.

required
output_type Optional[OutlinesLogitsProcessor]

The logits processor the model will use to constrain the format of the generated text.

None
inference_kwargs Any

Additional keyword arguments to pass to the generate method of the transformers model.

{}

Returns:

Type Description
Union[str, List[str]]

The text generated by the model.

Source code in outlines/models/transformers.py
def generate(
    self,
    model_input: Union[str, dict, Chat],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **inference_kwargs: Any,
) -> Union[str, List[str]]:
    """Run a single generation with a `transformers` model.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response. For
        multi-modal models, the input should be a dictionary containing the
        `text` key with a value of type `Union[str, List[str]]` and the
        other keys required by the model.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    inference_kwargs
        Additional keyword arguments to pass to the `generate` method
        of the `transformers` model.

    Returns
    -------
    Union[str, List[str]]
        The text generated by the model.

    """
    prompts, inputs = self._prepare_model_inputs(model_input, False)
    processor = self.type_adapter.format_output_type(output_type)

    output_ids = self._generate_output_seq(
        prompts, inputs, logits_processor=processor, **inference_kwargs
    )

    # Multi-modal models may return a 2D tensor even for a single sample;
    # squeeze it so one generation decodes to a plain string.
    if (
        inference_kwargs.get("num_return_sequences", 1) == 1
        and len(output_ids.shape) == 2
    ):
        output_ids = output_ids.squeeze(0)

    return self._decode_generation(output_ids)

generate_batch(model_input, output_type=None, **inference_kwargs)

Source code in outlines/models/transformers.py
def generate_batch(
    self,
    model_input: List[Union[str, dict, Chat]],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **inference_kwargs: Any,
) -> List[Union[str, List[str]]]:
    """Generate text for a batch of prompts using `transformers`.

    Parameters
    ----------
    model_input
        A list of prompts based on which the model will generate a
        response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    inference_kwargs
        Additional keyword arguments to pass to the `generate` method
        of the `transformers` model.

    Returns
    -------
    List[Union[str, List[str]]]
        The texts generated by the model, one entry per prompt (a list of
        strings per prompt when `num_return_sequences` > 1).

    """
    prompts, inputs = self._prepare_model_inputs(model_input, True) # type: ignore
    logits_processor = self.type_adapter.format_output_type(output_type)

    generated_ids = self._generate_output_seq(
        prompts, inputs, logits_processor=logits_processor, **inference_kwargs
    )

    # if there are multiple samples per input, convert generated_id to 3D
    num_samples = inference_kwargs.get("num_return_sequences", 1)
    if num_samples > 1:
        generated_ids = generated_ids.view(len(model_input), num_samples, -1)

    return self._decode_generation(generated_ids)

generate_stream(model_input, output_type, **inference_kwargs)

Not available for transformers models.

TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

Source code in outlines/models/transformers.py
def generate_stream(self, model_input, output_type, **inference_kwargs):
    """Streaming is not supported for `transformers` models.

    TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

    """
    message = "Streaming is not implemented for Transformers models."
    raise NotImplementedError(message)

TransformersMultiModal

Bases: Transformers

Thin wrapper around a transformers model and a transformers processor.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the transformers model and processor.

Source code in outlines/models/transformers.py
class TransformersMultiModal(Transformers):
    """Thin wrapper around a `transformers` model and a `transformers`
    processor.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `transformers` model and
    processor.

    """

    def __init__(
        self,
        model: "PreTrainedModel",
        processor,
        *,
        device_dtype: Optional["torch.dtype"] = None,
    ):
        """Create a TransformersMultiModal model instance

        We rely on the `__init__` method of the `Transformers` class to handle
        most of the initialization and then add elements specific to multimodal
        models.

        Parameters
        ----------
        model
            A `PreTrainedModel`, or any model that is compatible with the
            `transformers` API for models.
        processor
            A `ProcessorMixin` instance.
        device_dtype
            The dtype to use for the model. If not provided, the model will use
            the default dtype.

        """
        self.processor = processor
        self.processor.padding_side = "left"
        # NOTE(review): unconditionally overwrites any pad token the processor
        # already defines — confirm "[PAD]" is safe for all processors used.
        self.processor.pad_token = "[PAD]"

        tokenizer: "PreTrainedTokenizer" = self.processor.tokenizer

        super().__init__(model, tokenizer, device_dtype=device_dtype)

        # Replace the text-only adapter installed by the parent constructor
        # with the multimodal one.
        self.type_adapter = TransformersMultiModalTypeAdapter(
            tokenizer=tokenizer
        )

    def _prepare_model_inputs(
        self,
        model_input,
        is_batch: bool = False,
    ) -> Tuple[Union[str, List[str]], dict]:
        """Turn the user input into arguments to pass to the model"""
        if is_batch:
            prompts = [
                self.type_adapter.format_input(item) for item in model_input
            ]
        else:
            prompts = self.type_adapter.format_input(model_input)

        # The expected format is a single dict
        if is_batch:
            # Merge per-item dicts into one dict of lists: `text` gets one
            # entry per item, while other keys (presumably lists of media
            # assets) are concatenated.
            merged_prompts = defaultdict(list)
            for d in prompts:
                for key, value in d.items():
                    if key == "text":
                        merged_prompts[key].append(value)
                    else:
                        merged_prompts[key].extend(value)
        else:
            merged_prompts = prompts # type: ignore

        inputs = self.processor(
            **merged_prompts, padding=True, return_tensors="pt"
        )
        if self.device_dtype is not None:
            inputs = inputs.to(self.model.device, dtype=self.device_dtype)
        else:
            inputs = inputs.to(self.model.device)

        return merged_prompts["text"], inputs

__init__(model, processor, *, device_dtype=None)

Create a TransformersMultiModal model instance

We rely on the __init__ method of the Transformers class to handle most of the initialization and then add elements specific to multimodal models.

Parameters:

Name Type Description Default
model PreTrainedModel

A PreTrainedModel, or any model that is compatible with the transformers API for models.

required
processor

A ProcessorMixin instance.

required
device_dtype Optional[dtype]

The dtype to use for the model. If not provided, the model will use the default dtype.

None
Source code in outlines/models/transformers.py
def __init__(
    self,
    model: "PreTrainedModel",
    processor,
    *,
    device_dtype: Optional["torch.dtype"] = None,
):
    """Create a TransformersMultiModal model instance.

    Shared setup is delegated to `Transformers.__init__`; this method only
    adds the processor-specific configuration for multimodal models.

    Parameters
    ----------
    model
        A `PreTrainedModel`, or any model that is compatible with the
        `transformers` API for models.
    processor
        A `ProcessorMixin` instance.
    device_dtype
        The dtype to use for the model. If not provided, the model will use
        the default dtype.

    """
    # Configure the processor for left-padded batches before handing its
    # tokenizer to the parent class.
    processor.padding_side = "left"
    processor.pad_token = "[PAD]"
    self.processor = processor

    hf_tokenizer: "PreTrainedTokenizer" = processor.tokenizer

    super().__init__(model, hf_tokenizer, device_dtype=device_dtype)

    # Replace the text-only adapter installed by the parent constructor.
    self.type_adapter = TransformersMultiModalTypeAdapter(
        tokenizer=hf_tokenizer
    )

VLLM

Bases: Model

Thin wrapper around the openai.OpenAI client used to communicate with a vllm server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the vllm server.

Source code in outlines/models/vllm.py
class VLLM(Model):
    """Thin wrapper around the `openai.OpenAI` client used to communicate with
    a `vllm` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    `vllm` server.
    """

    def __init__(
        self,
        client: "OpenAI",
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            An `openai.OpenAI` client instance.
        model_name
            The name of the model to use; serves as the default `model`
            argument when none is provided at inference time.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = VLLMTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        """
        client_args = self._build_client_args(
            model_input,
            output_type,
            **inference_kwargs,
        )

        response = self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The vLLM server refused to answer the request: "
                    f"{message.refusal}"
                )

        # A single choice is returned as a bare string for convenience.
        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not available for the vLLM server integration."""
        raise NotImplementedError("VLLM does not support batch inference.")

    def generate_stream(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = self.client.chat.completions.create(
            **client_args, stream=True,
        )

        for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the OpenAI client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        # Structured-output settings travel in `extra_body`, merged with any
        # caller-provided entries.
        extra_body = inference_kwargs.pop("extra_body", {})
        extra_body.update(output_type_args)

        # Fall back to the instance-level model name when the caller did not
        # pass one for this request.
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }
        if extra_body:
            client_args["extra_body"] = extra_body

        return client_args

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client OpenAI

An openai.OpenAI client instance.

required
Source code in outlines/models/vllm.py
def __init__(
    self,
    client: "OpenAI",
    model_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    client
        An `openai.OpenAI` client instance.
    model_name
        The name of the model to use; serves as the default `model`
        argument when none is provided at inference time.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = VLLMTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs)

Generate text using vLLM.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/vllm.py
def generate(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Request a completion from a vLLM server.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs
    )
    response = self.client.chat.completions.create(**client_args)

    contents = []
    for choice in response.choices:
        message = choice.message
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The vLLM server refused to answer the request: "
                f"{message.refusal}"
            )
        contents.append(message.content)

    # A single choice is returned as a bare string for convenience.
    if len(contents) == 1:
        return contents[0]
    return contents

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using vLLM.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/vllm.py
def generate_stream(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream a completion from a vLLM server, chunk by chunk.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    response_stream = self.client.chat.completions.create(
        stream=True, **client_args,
    )

    for chunk in response_stream:  # pragma: no cover
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta is not None:
            yield delta

VLLMOffline

Bases: Model

Thin wrapper around a vllm.LLM model.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the vllm.LLM model.

Source code in outlines/models/vllm_offline.py
class VLLMOffline(Model):
    """Thin wrapper around a `vllm.LLM` model.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `vllm.LLM` model.

    """

    def __init__(self, model: "LLM"):
        """Create a VLLM model instance.

        Parameters
        ----------
        model
            A `vllm.LLM` model instance.

        """
        self.model = model
        self.tokenizer = self.model.get_tokenizer()
        self.type_adapter = VLLMOfflineTypeAdapter(has_chat_template=self._check_chat_template())

    def _build_generation_args(
        self,
        inference_kwargs: dict,
        output_type: Optional[Any] = None,
    ) -> "SamplingParams":
        """Create the `SamplingParams` object to pass to the `generate` method
        of the `vllm.LLM` model."""
        from vllm.sampling_params import StructuredOutputsParams, SamplingParams

        sampling_params = inference_kwargs.pop("sampling_params", None)

        if sampling_params is None:
            sampling_params = SamplingParams()

        output_type_args = self.type_adapter.format_output_type(output_type)
        if output_type_args:
            # `SamplingParams` is rebuilt from its declared fields with
            # `structured_outputs` added rather than mutated in place.
            original_sampling_params_dict = {f: getattr(sampling_params, f) for f in sampling_params.__struct_fields__}
            sampling_params_dict = {**original_sampling_params_dict, "structured_outputs": StructuredOutputsParams(**output_type_args)}
            sampling_params = SamplingParams(**sampling_params_dict)

        return sampling_params

    def generate(
        self,
        model_input: Chat | str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, List[str]]:
        """Generate text using vLLM offline.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            in the `vllm.LLM` model.

        Returns
        -------
        Union[str, List[str]]
            The text generated by the model.

        """
        sampling_params = self._build_generation_args(
            inference_kwargs,
            output_type,
        )

        model_input = self.type_adapter.format_input(model_input)

        # A chat input is formatted as a list of messages; a plain prompt
        # stays a string.
        if isinstance(model_input, list):
            results = self.model.chat(
                messages=model_input,
                sampling_params=sampling_params,
                **inference_kwargs,
            )
        else:
            results = self.model.generate(
                prompts=model_input,
                sampling_params=sampling_params,
                **inference_kwargs,
            )
        # A single request was submitted, so only the first result's outputs
        # are relevant.
        results = [completion.text for completion in results[0].outputs]

        if len(results) == 1:
            return results[0]
        else:
            return results

    def generate_batch(
        self,
        model_input: List[Chat | str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[List[str], List[List[str]]]:
        """Generate a batch of completions using vLLM offline.

        Parameters
        ----------
        model_input
            The list of prompts based on which the model will generate a
            response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            in the `vllm.LLM` model.

        Returns
        -------
        Union[List[str], List[List[str]]]
            The text generated by the model.

        """
        sampling_params = self._build_generation_args(
            inference_kwargs,
            output_type,
        )

        model_inputs = [self.type_adapter.format_input(item) for item in model_input]

        if model_inputs and isinstance(model_inputs[0], list):
            results = self.model.chat(
                messages=model_inputs,
                sampling_params=sampling_params,
                **inference_kwargs,
            )
        else:
            results = self.model.generate(
                prompts=model_inputs,
                sampling_params=sampling_params,
                **inference_kwargs,
            )
        return [[sample.text for sample in batch.outputs] for batch in results]

    def generate_stream(self, model_input, output_type, **inference_kwargs):
        """Not available for `vllm.LLM`.

        TODO: Implement the streaming functionality ourselves.

        """
        raise NotImplementedError(
            "Streaming is not available for the vLLM offline integration."
        )

    def _check_chat_template(self) -> bool:
        """Check if the tokenizer has a chat template."""
        from vllm.transformers_utils.tokenizer import (
            PreTrainedTokenizer,
            PreTrainedTokenizerFast,
            TokenizerBase
        )
        from outlines.models.tokenizer import _check_hf_chat_template

        if isinstance(self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
            return _check_hf_chat_template(self.tokenizer)
        elif isinstance(self.tokenizer, TokenizerBase):
            # vLLM defines its own TokenizerBase class, and only provides
            # limited compatibility with HuggingFace tokenizers. So we
            # need to check for chat template support differently.
            try:
                self.tokenizer.apply_chat_template([{"role": "user", "content": "test"}])
                return True
            except Exception:
                return False
        else:  # Never reached  # pragma: no cover
            return False

__init__(model)

Create a VLLM model instance.

Parameters:

Name Type Description Default
model LLM

A vllm.LLM model instance.

required
Source code in outlines/models/vllm_offline.py
def __init__(self, model: "LLM"):
    """Create a VLLM model instance.

    Parameters
    ----------
    model
        A `vllm.LLM` model instance.

    """
    self.model = model
    # The tokenizer is needed to detect chat-template support below.
    self.tokenizer = model.get_tokenizer()
    self.type_adapter = VLLMOfflineTypeAdapter(
        has_chat_template=self._check_chat_template()
    )

generate(model_input, output_type=None, **inference_kwargs)

Generate text using vLLM offline.

Parameters:

Name Type Description Default
model_input

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The logits processor the model will use to constrain the format of the generated text.

None
inference_kwargs Any

Additional keyword arguments to pass to the generate method in the vllm.LLM model.

{}

Returns:

Type Description
Union[str, List[str]]

The text generated by the model.

Source code in outlines/models/vllm_offline.py
def generate(
    self,
    model_input: Chat | str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, List[str]]:
    """Generate text using vLLM offline.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    inference_kwargs
        Additional keyword arguments to pass to the `generate` method
        in the `vllm.LLM` model.

    Returns
    -------
    Union[str, List[str]]
        The text generated by the model.

    """
    sampling_params = self._build_generation_args(
        inference_kwargs,
        output_type,
    )

    model_input = self.type_adapter.format_input(model_input)

    # Chat inputs are formatted as a list of messages and must go through
    # the chat endpoint; plain string prompts use the completion endpoint.
    if isinstance(model_input, list):
        results = self.model.chat(
            messages=model_input,
            sampling_params=sampling_params,
            **inference_kwargs,
        )
    else:
        results = self.model.generate(
            prompts=model_input,
            sampling_params=sampling_params,
            **inference_kwargs,
        )
    # A single prompt yields a single request output; unwrap its samples.
    results = [completion.text for completion in results[0].outputs]

    # Return a bare string when only one sample was generated.
    if len(results) == 1:
        return results[0]
    else:
        return results

generate_batch(model_input, output_type=None, **inference_kwargs)

Generate a batch of completions using vLLM offline.

Parameters:

Name Type Description Default
model_input

The list of prompts based on which the model will generate a response.

required
output_type Optional[Any]

The logits processor the model will use to constrain the format of the generated text.

None
inference_kwargs Any

Additional keyword arguments to pass to the generate method in the vllm.LLM model.

{}

Returns:

Type Description
Union[List[str], List[List[str]]]

The text generated by the model.

Source code in outlines/models/vllm_offline.py
def generate_batch(
    self,
    model_input: List[Chat | str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[List[str], List[List[str]]]:
    """Generate a batch of completions using vLLM offline.

    Parameters
    ----------
    model_input
        The list of prompts based on which the model will generate a
        response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    inference_kwargs
        Additional keyword arguments to pass to the `generate` method
        in the `vllm.LLM` model.

    Returns
    -------
    Union[List[str], List[List[str]]]
        The text generated by the model.

    """
    sampling_params = self._build_generation_args(
        inference_kwargs,
        output_type,
    )

    model_inputs = [self.type_adapter.format_input(item) for item in model_input]

    # Chat inputs are formatted as lists of messages and must go through
    # the chat endpoint; plain string prompts use the completion endpoint.
    if model_inputs and isinstance(model_inputs[0], list):
        results = self.model.chat(
            messages=model_inputs,
            sampling_params=sampling_params,
            **inference_kwargs,
        )
    else:
        results = self.model.generate(
            prompts=model_inputs,
            sampling_params=sampling_params,
            **inference_kwargs,
        )
    return [[sample.text for sample in batch.outputs] for batch in results]

generate_stream(model_input, output_type, **inference_kwargs)

Not available for vllm.LLM.

TODO: Implement the streaming functionality ourselves.

Source code in outlines/models/vllm_offline.py
def generate_stream(self, model_input, output_type, **inference_kwargs):
    """Raise an error: streaming is not supported for `vllm.LLM`.

    TODO: Implement the streaming functionality ourselves.

    """
    raise NotImplementedError(
        "Streaming is not available for the vLLM offline integration."
    )

from_anthropic(client, model_name=None)

Create an Outlines Anthropic model instance from an anthropic.Anthropic client instance.

Parameters:

Name Type Description Default
client Anthropic

An anthropic.Anthropic client instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
Anthropic

An Outlines Anthropic model instance.

Source code in outlines/models/anthropic.py
def from_anthropic(
    client: "AnthropicClient", model_name: Optional[str] = None
) -> Anthropic:
    """Wrap an `anthropic.Anthropic` client in an Outlines `Anthropic` model.

    Parameters
    ----------
    client
        An `anthropic.Anthropic` client instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Anthropic
        An Outlines `Anthropic` model instance.

    """
    return Anthropic(client, model_name)

from_dottxt(client, model_name=None, model_revision=None)

Create an Outlines Dottxt model instance from a dottxt.Dottxt client instance.

Parameters:

Name Type Description Default
client Dottxt

A dottxt.Dottxt client instance.

required
model_name Optional[str]

The name of the model to use.

None
model_revision Optional[str]

The revision of the model to use.

None

Returns:

Type Description
Dottxt

An Outlines Dottxt model instance.

Source code in outlines/models/dottxt.py
def from_dottxt(
    client: "DottxtClient",
    model_name: Optional[str] = None,
    model_revision: Optional[str] = None,
) -> Dottxt:
    """Wrap a `dottxt.Dottxt` client in an Outlines `Dottxt` model.

    Parameters
    ----------
    client
        A `dottxt.Dottxt` client instance.
    model_name
        The name of the model to use.
    model_revision
        The revision of the model to use.

    Returns
    -------
    Dottxt
        An Outlines `Dottxt` model instance.

    """
    return Dottxt(client, model_name, model_revision)

from_gemini(client, model_name=None)

Create an Outlines Gemini model instance from a google.genai.Client instance.

Parameters:

Name Type Description Default
client Client

A google.genai.Client instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
Gemini

An Outlines Gemini model instance.

Source code in outlines/models/gemini.py
def from_gemini(client: "Client", model_name: Optional[str] = None) -> Gemini:
    """Wrap a `google.genai.Client` in an Outlines `Gemini` model.

    Parameters
    ----------
    client
        A `google.genai.Client` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Gemini
        An Outlines `Gemini` model instance.

    """
    return Gemini(client, model_name)

from_llamacpp(model, chat_mode=True)

Create an Outlines LlamaCpp model instance from a llama_cpp.Llama instance.

Parameters:

Name Type Description Default
model Llama

A llama_cpp.Llama instance.

required
chat_mode bool

Whether to enable chat mode. If False, the model will regard all str inputs as plain text prompts. If True, the model will regard all str inputs as user messages in a chat conversation.

True

Returns:

Type Description
LlamaCpp

An Outlines LlamaCpp model instance.

Source code in outlines/models/llamacpp.py
def from_llamacpp(model: "Llama", chat_mode: bool = True) -> LlamaCpp:
    """Wrap a `llama_cpp.Llama` instance in an Outlines `LlamaCpp` model.

    Parameters
    ----------
    model
        A `llama_cpp.Llama` instance.
    chat_mode
        Whether to enable chat mode. If `False`, every `str` input is
        treated as a plain text prompt. If `True`, every `str` input is
        treated as a user message in a chat conversation.

    Returns
    -------
    LlamaCpp
        An Outlines `LlamaCpp` model instance.

    """
    return LlamaCpp(model, chat_mode=chat_mode)

from_lmstudio(client, model_name=None)

Create an Outlines LMStudio model instance from a lmstudio.Client or lmstudio.AsyncClient instance.

Parameters:

Name Type Description Default
client Union[Client, AsyncClient]

A lmstudio.Client or lmstudio.AsyncClient instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
Union[LMStudio, AsyncLMStudio]

An Outlines LMStudio or AsyncLMStudio model instance.

Source code in outlines/models/lmstudio.py
def from_lmstudio(
    client: Union["Client", "AsyncClient"],
    model_name: Optional[str] = None,
) -> Union[LMStudio, AsyncLMStudio]:
    """Wrap a `lmstudio.Client` or `lmstudio.AsyncClient` in an Outlines
    `LMStudio` or `AsyncLMStudio` model.

    Parameters
    ----------
    client
        A `lmstudio.Client` or `lmstudio.AsyncClient` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[LMStudio, AsyncLMStudio]
        An Outlines `LMStudio` or `AsyncLMStudio` model instance.

    """
    from lmstudio import AsyncClient, Client

    # The sync/async choice is driven entirely by the client's type.
    if isinstance(client, Client):
        return LMStudio(client, model_name)
    if isinstance(client, AsyncClient):
        return AsyncLMStudio(client, model_name)
    raise ValueError(
        "Invalid client type, the client must be an instance of "
        "`lmstudio.Client` or `lmstudio.AsyncClient`."
    )

from_mistral(client, model_name=None, async_client=False)

Create an Outlines Mistral model instance from a mistralai.Mistral client.

Parameters:

Name Type Description Default
client Mistral

A mistralai.Mistral client instance.

required
model_name Optional[str]

The name of the model to use.

None
async_client bool

If True, return an AsyncMistral instance; otherwise, return a Mistral instance.

False

Returns:

Type Description
Union[Mistral, AsyncMistral]

An Outlines Mistral or AsyncMistral model instance.

Source code in outlines/models/mistral.py
def from_mistral(
    client: "MistralClient",
    model_name: Optional[str] = None,
    async_client: bool = False,
) -> Union[Mistral, AsyncMistral]:
    """Create an Outlines `Mistral` or `AsyncMistral` model instance from a
    `mistralai.Mistral` client instance.

    Parameters
    ----------
    client
        A `mistralai.Mistral` client instance.
    model_name
        The name of the model to use.
    async_client
        If `True`, return an `AsyncMistral` instance; otherwise, return a
        `Mistral` instance.

    Returns
    -------
    Union[Mistral, AsyncMistral]
        An Outlines `Mistral` or `AsyncMistral` model instance.

    """
    from mistralai import Mistral as MistralClient

    # Unlike the other wrappers, the sync/async choice is driven by a flag
    # rather than the client type, so validate the client eagerly.
    if not isinstance(client, MistralClient):
        raise ValueError(
            "Invalid client type. The client must be an instance of "
            "`mistralai.Mistral`."
        )

    if async_client:
        return AsyncMistral(client, model_name)
    else:
        return Mistral(client, model_name)

from_mlxlm(model, tokenizer)

Create an Outlines MLXLM model instance from an mlx_lm model and a tokenizer.

Parameters:

Name Type Description Default
model Module

An instance of an mlx_lm model.

required
tokenizer PreTrainedTokenizer

An instance of an mlx_lm tokenizer or of a compatible transformers tokenizer.

required

Returns:

Type Description
MLXLM

An Outlines MLXLM model instance.

Source code in outlines/models/mlxlm.py
def from_mlxlm(model: "nn.Module", tokenizer: "PreTrainedTokenizer") -> MLXLM:
    """Wrap an `mlx_lm` model and tokenizer in an Outlines `MLXLM` model.

    Parameters
    ----------
    model
        An instance of an `mlx_lm` model.
    tokenizer
        An instance of an `mlx_lm` tokenizer or of a compatible
        transformers tokenizer.

    Returns
    -------
    MLXLM
        An Outlines `MLXLM` model instance.

    """
    return MLXLM(model, tokenizer)

from_ollama(client, model_name=None)

Create an Outlines Ollama model instance from an ollama.Client or ollama.AsyncClient instance.

Parameters:

Name Type Description Default
client Union[Client, AsyncClient]

A ollama.Client or ollama.AsyncClient instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
Union[Ollama, AsyncOllama]

An Outlines Ollama or AsyncOllama model instance.

Source code in outlines/models/ollama.py
def from_ollama(
    client: Union["Client", "AsyncClient"], model_name: Optional[str] = None
) -> Union[Ollama, AsyncOllama]:
    """Wrap an `ollama.Client` or `ollama.AsyncClient` in an Outlines
    `Ollama` or `AsyncOllama` model.

    Parameters
    ----------
    client
        A `ollama.Client` or `ollama.AsyncClient` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[Ollama, AsyncOllama]
        An Outlines `Ollama` or `AsyncOllama` model instance.

    """
    from ollama import AsyncClient, Client

    # The sync/async choice is driven entirely by the client's type.
    if isinstance(client, Client):
        return Ollama(client, model_name)
    if isinstance(client, AsyncClient):
        return AsyncOllama(client, model_name)
    raise ValueError(
        "Invalid client type, the client must be an instance of "
        "`ollama.Client` or `ollama.AsyncClient`."
    )

from_openai(client, model_name=None)

Create an Outlines OpenAI or AsyncOpenAI model instance from an openai.OpenAI or openai.AsyncOpenAI client.

Parameters:

Name Type Description Default
client Union[OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI]

An openai.OpenAI, openai.AsyncOpenAI, openai.AzureOpenAI or openai.AsyncAzureOpenAI client instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
OpenAI

An Outlines OpenAI or AsyncOpenAI model instance.

Source code in outlines/models/openai.py
def from_openai(
    client: Union[
        "OpenAIClient",
        "AsyncOpenAIClient",
        "AzureOpenAIClient",
        "AsyncAzureOpenAIClient",
    ],
    model_name: Optional[str] = None,
) -> Union[OpenAI, AsyncOpenAI]:
    """Create an Outlines `OpenAI` or `AsyncOpenAI` model instance from an
    `openai.OpenAI` or `openai.AsyncOpenAI` client.

    Parameters
    ----------
    client
        An `openai.OpenAI`, `openai.AsyncOpenAI`, `openai.AzureOpenAI` or
        `openai.AsyncAzureOpenAI` client instance.
    model_name
        The name of the model to use.

    Returns
    -------
    OpenAI
        An Outlines `OpenAI` or `AsyncOpenAI` model instance.

    """
    import openai

    # Azure clients are subclasses of the plain OpenAI clients, so these
    # two isinstance checks cover all four accepted client types.
    if isinstance(client, openai.OpenAI):
        return OpenAI(client, model_name)
    elif isinstance(client, openai.AsyncOpenAI):
        return AsyncOpenAI(client, model_name)
    else:
        # Fixed: a stray "+ " was previously included inside the string
        # literal (a concatenation operator mistakenly placed in-quotes),
        # garbling the error message shown to users.
        raise ValueError(
            "Invalid client type. The client must be an instance of "
            "`openai.OpenAI` or `openai.AsyncOpenAI`."
        )

from_sglang(client, model_name=None)

Create a SGLang or AsyncSGLang instance from an openai.OpenAI or openai.AsyncOpenAI instance.

Parameters:

Name Type Description Default
client Union[OpenAI, AsyncOpenAI]

An openai.OpenAI or openai.AsyncOpenAI instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
Union[SGLang, AsyncSGLang]

An Outlines SGLang or AsyncSGLang model instance.

Source code in outlines/models/sglang.py
def from_sglang(
    client: Union["OpenAI", "AsyncOpenAI"],
    model_name: Optional[str] = None,
) -> Union[SGLang, AsyncSGLang]:
    """Wrap an `openai.OpenAI` or `openai.AsyncOpenAI` instance in an
    Outlines `SGLang` or `AsyncSGLang` model.

    Parameters
    ----------
    client
        An `openai.OpenAI` or `openai.AsyncOpenAI` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[SGLang, AsyncSGLang]
        An Outlines `SGLang` or `AsyncSGLang` model instance.

    """
    from openai import AsyncOpenAI, OpenAI

    # The sync/async choice is driven entirely by the client's type.
    if isinstance(client, OpenAI):
        return SGLang(client, model_name)
    if isinstance(client, AsyncOpenAI):
        return AsyncSGLang(client, model_name)
    raise ValueError(
        f"Unsupported client type: {type(client)}.\n"
        "Please provide an OpenAI or AsyncOpenAI instance."
    )

from_tgi(client)

Create an Outlines TGI or AsyncTGI model instance from a huggingface_hub.InferenceClient or huggingface_hub.AsyncInferenceClient instance.

Parameters:

Name Type Description Default
client Union[InferenceClient, AsyncInferenceClient]

An huggingface_hub.InferenceClient or huggingface_hub.AsyncInferenceClient instance.

required

Returns:

Type Description
Union[TGI, AsyncTGI]

An Outlines TGI or AsyncTGI model instance.

Source code in outlines/models/tgi.py
def from_tgi(
    client: Union["InferenceClient", "AsyncInferenceClient"],
) -> Union[TGI, AsyncTGI]:
    """Wrap a `huggingface_hub.InferenceClient` or
    `huggingface_hub.AsyncInferenceClient` in an Outlines `TGI` or
    `AsyncTGI` model.

    Parameters
    ----------
    client
        A `huggingface_hub.InferenceClient` or
        `huggingface_hub.AsyncInferenceClient` instance.

    Returns
    -------
    Union[TGI, AsyncTGI]
        An Outlines `TGI` or `AsyncTGI` model instance.

    """
    from huggingface_hub import AsyncInferenceClient, InferenceClient

    # The sync/async choice is driven entirely by the client's type.
    if isinstance(client, InferenceClient):
        return TGI(client)
    if isinstance(client, AsyncInferenceClient):
        return AsyncTGI(client)
    raise ValueError(
        f"Unsupported client type: {type(client)}.\n"
        + "Please provide an HuggingFace InferenceClient "
        + "or AsyncInferenceClient instance."
    )

from_transformers(model, tokenizer_or_processor, *, device_dtype=None)

Create an Outlines Transformers or TransformersMultiModal model instance from a PreTrainedModel instance and a PreTrainedTokenizer or ProcessorMixin instance.

outlines supports PreTrainedModelForCausalLM, PreTrainedMambaForCausalLM, PreTrainedModelForSeq2Seq and any model that implements the transformers model API.

Parameters:

Name Type Description Default
model PreTrainedModel

A transformers.PreTrainedModel instance.

required
tokenizer_or_processor Union[PreTrainedTokenizer, ProcessorMixin]

A transformers.PreTrainedTokenizer or transformers.ProcessorMixin instance.

required
device_dtype Optional[dtype]

The dtype to use for the model. If not provided, the model will use the default dtype.

None

Returns:

Type Description
Union[Transformers, TransformersMultiModal]

An Outlines Transformers or TransformersMultiModal model instance.

Source code in outlines/models/transformers.py
def from_transformers(
    model: "PreTrainedModel",
    tokenizer_or_processor: Union["PreTrainedTokenizer", "ProcessorMixin"],
    *,
    device_dtype: Optional["torch.dtype"] = None,
) -> Union[Transformers, TransformersMultiModal]:
    """Create an Outlines `Transformers` or `TransformersMultiModal` model
    instance from a `PreTrainedModel` instance and a `PreTrainedTokenizer` or
    `ProcessorMixin` instance.

    `outlines` supports `PreTrainedModelForCausalLM`,
    `PreTrainedMambaForCausalLM`, `PreTrainedModelForSeq2Seq` and any model
    that implements the `transformers` model API.

    Parameters
    ----------
    model
        A `transformers.PreTrainedModel` instance.
    tokenizer_or_processor
        A `transformers.PreTrainedTokenizer` or
        `transformers.ProcessorMixin` instance.
    device_dtype
        The dtype to use for the model. If not provided, the model will use
        the default dtype.

    Returns
    -------
    Union[Transformers, TransformersMultiModal]
        An Outlines `Transformers` or `TransformersMultiModal` model instance.

    """
    from transformers import (
        PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin)

    # A tokenizer means a text-only model; a processor means a multi-modal
    # model. Anything else is ambiguous and rejected below.
    if isinstance(
        tokenizer_or_processor, (PreTrainedTokenizer, PreTrainedTokenizerFast)
    ):
        tokenizer = tokenizer_or_processor
        return Transformers(model, tokenizer, device_dtype=device_dtype)
    elif isinstance(tokenizer_or_processor, ProcessorMixin):
        processor = tokenizer_or_processor
        return TransformersMultiModal(model, processor, device_dtype=device_dtype)
    else:
        # Fixed: the message previously read "We could determine" (missing
        # the "not") and doubled the article ("provide a ... a transformers").
        raise ValueError(
            "We could not determine whether the model passed to "
            + "`from_transformers` is a text-2-text or a multi-modal model. "
            + "Please provide a transformers tokenizer or processor."
        )

from_vllm(client, model_name=None)

Create an Outlines VLLM or AsyncVLLM model instance from an openai.OpenAI or openai.AsyncOpenAI instance.

Parameters:

Name Type Description Default
client Union[OpenAI, AsyncOpenAI]

An openai.OpenAI or openai.AsyncOpenAI instance.

required
model_name Optional[str]

The name of the model to use.

None

Returns:

Type Description
Union[VLLM, AsyncVLLM]

An Outlines VLLM or AsyncVLLM model instance.

Source code in outlines/models/vllm.py
def from_vllm(
    client: Union["OpenAI", "AsyncOpenAI"],
    model_name: Optional[str] = None,
) -> Union[VLLM, AsyncVLLM]:
    """Wrap an `openai.OpenAI` or `openai.AsyncOpenAI` instance in an
    Outlines `VLLM` or `AsyncVLLM` model.

    Parameters
    ----------
    client
        An `openai.OpenAI` or `openai.AsyncOpenAI` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[VLLM, AsyncVLLM]
        An Outlines `VLLM` or `AsyncVLLM` model instance.

    """
    from openai import AsyncOpenAI, OpenAI

    # The sync/async choice is driven entirely by the client's type.
    if isinstance(client, OpenAI):
        return VLLM(client, model_name)
    if isinstance(client, AsyncOpenAI):
        return AsyncVLLM(client, model_name)
    raise ValueError(
        f"Unsupported client type: {type(client)}.\n"
        "Please provide an OpenAI or AsyncOpenAI instance."
    )

from_vllm_offline(model)

Create an Outlines VLLMOffline model instance from a vllm.LLM instance.

Parameters:

Name Type Description Default
model LLM

A vllm.LLM instance.

required

Returns:

Type Description
VLLMOffline

An Outlines VLLMOffline model instance.

Source code in outlines/models/vllm_offline.py
def from_vllm_offline(model: "LLM") -> VLLMOffline:
    """Wrap a `vllm.LLM` instance in an Outlines `VLLMOffline` model.

    Parameters
    ----------
    model
        A `vllm.LLM` instance.

    Returns
    -------
    VLLMOffline
        An Outlines `VLLMOffline` model instance.

    """
    return VLLMOffline(model)

applications

Encapsulate a prompt template and an output type into a reusable object.

Application

Application is a class that encapsulates a prompt template and an output type. It can be called to generate a response by providing a model, the values to be substituted in the template in a dictionary and optional inference parameters.

Parameters:

Name Type Description Default
template Union[Template, Callable]

A callable that takes arguments and returns a prompt string.

required
output_type Any

The expected output type of the generated response.

None

Examples:

from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from outlines import models, Application
from outlines.types import JsonType
from outlines.templates import Template

class OutputModel(BaseModel):
    result: int

model = models.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

template_string = "What is 2 times {{ num }}?"
template = Template.from_string(template_string)

application = Application(template, JsonType(OutputModel))

result = application(model, {"num": 3}, max_new_tokens=20)
print(result)  # Expected output: { "result" : 6 }
Source code in outlines/applications.py
class Application:
    """Encapsulate a prompt template and an output type into a callable.

    Calling an `Application` with a model, a dictionary of values to
    substitute in the template, and optional inference parameters renders
    the prompt and generates a response constrained by the output type.

    Parameters
    ----------
    template : Union[Template, Callable]
        A callable that takes arguments and returns a prompt string.
    output_type : Any
        The expected output type of the generated response.

    Examples
    --------
    ```python
    from pydantic import BaseModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from outlines import models, Application
    from outlines.types import JsonType
    from outlines.templates import Template

    class OutputModel(BaseModel):
        result: int

    model = models.from_transformers(
        AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
        AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    )

    template_string = "What is 2 times {{ num }}?"
    template = Template.from_string(template_string)

    application = Application(template, JsonType(OutputModel))

    result = application(model, {"num": 3}, max_new_tokens=20)
    print(result)  # Expected output: { "result" : 6 }
    ```

    """
    def __init__(
        self,
        template: Union[Template, Callable],
        output_type: Optional[Any] = None,
    ):
        """
        Parameters
        ----------
        template
            The template to use to build the prompt.
        output_type
            The output type provided to the generator.

        """
        self.template = template
        self.output_type = output_type
        # The generator is created lazily on the first call and cached.
        self.generator: Optional[Union[
            BlackBoxGenerator, SteerableGenerator
        ]] = None
        self.model: Optional[Model] = None

    def __call__(
        self,
        model: Model,
        template_vars: Dict[str, Any],
        **inference_kwargs
    ) -> Any:
        """Render the template and generate a response with `model`.

        Parameters
        ----------
        model
            The model to use to generate the response.
        template_vars
            The variables to be substituted in the template.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The generated response.

        """
        if model is None:
            raise ValueError("you must provide a model")
        # Reuse the cached generator unless the model changed since the
        # previous call.
        if model != self.model:
            self.model = model
            self.generator = Generator(model, self.output_type)  # type: ignore

        rendered_prompt = self.template(**template_vars)
        assert self.generator is not None
        return self.generator(rendered_prompt, **inference_kwargs)

__call__(model, template_vars, **inference_kwargs)

Parameters:

Name Type Description Default
model Model

The model to use to generate the response.

required
template_vars Dict[str, Any]

The variables to be substituted in the template.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The generated response.

Source code in outlines/applications.py
def __call__(
    self,
    model: Model,
    template_vars: Dict[str, Any],
    **inference_kwargs
) -> Any:
    """Render the template and generate a response with `model`.

    Parameters
    ----------
    model
        The model to use to generate the response.
    template_vars
        The variables to be substituted in the template.
    **inference_kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    Any
        The generated response.

    """
    if model is None:
        raise ValueError("you must provide a model")
    # Reuse the cached generator unless the model changed since the
    # previous call.
    if model != self.model:
        self.model = model
        self.generator = Generator(model, self.output_type)  # type: ignore

    rendered_prompt = self.template(**template_vars)
    assert self.generator is not None
    return self.generator(rendered_prompt, **inference_kwargs)

__init__(template, output_type=None)

Parameters:

Name Type Description Default
template Union[Template, Callable]

The template to use to build the prompt.

required
output_type Optional[Any]

The output type provided to the generator.

None
Source code in outlines/applications.py
def __init__(
    self,
    template: Union[Template, Callable],
    output_type: Optional[Any] = None,
):
    """
    Parameters
    ----------
    template
        The template to use to build the prompt.
    output_type
        The output type provided to the generator.

    """
    self.template = template
    self.output_type = output_type
    # The generator is created lazily on the first call and cached.
    self.generator: Optional[Union[
        BlackBoxGenerator, SteerableGenerator
    ]] = None
    self.model: Optional[Model] = None

backends

Module to define the backends in charge of creating logits processors.

BaseBackend

Bases: ABC

Base class for all backends.

The subclasses must implement methods that create a logits processor from a JSON schema, regex or CFG.

Source code in outlines/backends/base.py
class BaseBackend(ABC):
    """Abstract base class for all backends.

    Subclasses must be able to build a logits processor from each of the
    three supported constraint formats: JSON schema, regex, and CFG.

    """

    @abstractmethod
    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> LogitsProcessorType:
        """Build a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

    @abstractmethod
    def get_regex_logits_processor(self, regex: str) -> LogitsProcessorType:
        """Build a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

    @abstractmethod
    def get_cfg_logits_processor(self, grammar: str) -> LogitsProcessorType:
        """Build a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

get_cfg_logits_processor(grammar) abstractmethod

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/base.py
@abstractmethod
def get_cfg_logits_processor(self, grammar: str) -> LogitsProcessorType:
    """Build a logits processor from a context-free grammar.

    Parameters
    ----------
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    ...

get_json_schema_logits_processor(json_schema) abstractmethod

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/base.py
@abstractmethod
def get_json_schema_logits_processor(
    self, json_schema: str
) -> LogitsProcessorType:
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Abstract: each concrete backend provides the implementation.
    ...

get_regex_logits_processor(regex) abstractmethod

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/base.py
@abstractmethod
def get_regex_logits_processor(self, regex: str) -> LogitsProcessorType:
    """Create a logits processor from a regex.

    Parameters
    ----------
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Abstract: each concrete backend provides the implementation.
    ...

LLGuidanceBackend

Bases: BaseBackend

Backend for LLGuidance.

Source code in outlines/backends/llguidance.py
class LLGuidanceBackend(BaseBackend):
    """Backend for LLGuidance."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        """
        # Deferred import: llguidance is an optional dependency.
        import llguidance as llg

        self.llg = llg
        self.llg_tokenizer = self._create_llg_tokenizer(model)
        self.tensor_library_name = model.tensor_library_name

    def _create_llg_tokenizer(self, model: SteerableModel) -> "LLGTokenizer":
        """Build the llguidance tokenizer matching the model's tokenizer.

        Parameters
        ----------
        model: Model
            The Outlines model.

        Returns
        -------
        LLGTokenizer
            The llg tokenizer.

        """
        if isinstance(model, Transformers):
            import llguidance.hf

            return llguidance.hf.from_tokenizer(model.hf_tokenizer)

        if isinstance(model, LlamaCpp):
            import llama_cpp
            import llguidance.llamacpp

            raw_vocab = llama_cpp.llama_model_get_vocab(model.model.model)
            return llguidance.llamacpp.lltokenizer_from_vocab(raw_vocab)

        if isinstance(model, MLXLM):  # pragma: no cover
            import llguidance.hf

            return llguidance.hf.from_tokenizer(
                model.mlx_tokenizer._tokenizer
            )

        raise ValueError(  # pragma: no cover
            f"Unsupported model type: {type(model)}. "
            "Llguidance only supports LlamaCpp, MLXLM "
            "and Transformers models."
        )

    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> LLGuidanceLogitsProcessor:
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        spec = self.llg.grammar_from("json_schema", json_schema)
        return LLGuidanceLogitsProcessor(
            spec, self.llg_tokenizer, self.tensor_library_name
        )

    def get_regex_logits_processor(
        self, regex: str
    ) -> LLGuidanceLogitsProcessor:
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        spec = self.llg.grammar_from("regex", regex)
        return LLGuidanceLogitsProcessor(
            spec, self.llg_tokenizer, self.tensor_library_name
        )

    def get_cfg_logits_processor(
        self, grammar: str
    ) -> LLGuidanceLogitsProcessor:
        """Create a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        # Try the "grammar" format first; fall back to "lark" when
        # llguidance rejects the spec with a ValueError.
        try:
            spec = self.llg.grammar_from("grammar", grammar)
        except ValueError:
            spec = self.llg.grammar_from("lark", grammar)
        return LLGuidanceLogitsProcessor(
            spec, self.llg_tokenizer, self.tensor_library_name
        )

__init__(model)

Parameters:

Name Type Description Default
model SteerableModel

The Outlines model of the user.

required
Source code in outlines/backends/llguidance.py
def __init__(self, model: SteerableModel):
    """
    Parameters
    ----------
    model
        The Outlines model of the user.

    """
    # Deferred import: llguidance is an optional dependency.
    import llguidance as llg

    self.llg = llg
    self.tensor_library_name = model.tensor_library_name
    self.llg_tokenizer = self._create_llg_tokenizer(model)

get_cfg_logits_processor(grammar)

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/llguidance.py
def get_cfg_logits_processor(
    self, grammar: str
) -> LLGuidanceLogitsProcessor:
    """Create a logits processor from a context-free grammar.

    Parameters
    ----------
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Try the "grammar" format first; fall back to "lark" when
    # llguidance rejects the spec with a ValueError.
    try:
        grammar_spec = self.llg.grammar_from("grammar", grammar)
    except ValueError:
        grammar_spec = self.llg.grammar_from("lark", grammar)
    return LLGuidanceLogitsProcessor(
        grammar_spec, self.llg_tokenizer, self.tensor_library_name
    )

get_json_schema_logits_processor(json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/llguidance.py
def get_json_schema_logits_processor(
    self, json_schema: str
) -> LLGuidanceLogitsProcessor:
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Compile the schema into an llguidance grammar spec.
    grammar_spec = self.llg.grammar_from("json_schema", json_schema)
    return LLGuidanceLogitsProcessor(
        grammar_spec, self.llg_tokenizer, self.tensor_library_name
    )

get_regex_logits_processor(regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/llguidance.py
def get_regex_logits_processor(
    self, regex: str
) -> LLGuidanceLogitsProcessor:
    """Create a logits processor from a regex.

    Parameters
    ----------
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Compile the regex into an llguidance grammar spec.
    grammar_spec = self.llg.grammar_from("regex", regex)
    return LLGuidanceLogitsProcessor(
        grammar_spec, self.llg_tokenizer, self.tensor_library_name
    )

OutlinesCoreBackend

Bases: BaseBackend

Backend for Outlines Core.

Source code in outlines/backends/outlines_core.py
class OutlinesCoreBackend(BaseBackend):
    """Backend for Outlines Core."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        """
        # Extract the tokenizer data needed to build the outlines-core
        # vocabulary; each model type exposes it differently.
        if isinstance(model, Transformers):
            tokenizer = model.tokenizer
            vocabulary = tokenizer.get_vocab()
            eos_token_id = tokenizer.eos_token_id
            eos_token = tokenizer.eos_token
            token_to_str = tokenizer.convert_token_to_string
        elif isinstance(model, LlamaCpp):
            tokenizer = model.tokenizer # type: ignore
            vocabulary = tokenizer.vocabulary
            eos_token_id = tokenizer.eos_token_id
            eos_token = tokenizer.eos_token
            token_to_str = tokenizer.convert_token_to_string
        elif isinstance(model, MLXLM): # pragma: no cover
            tokenizer = model.mlx_tokenizer # type: ignore
            vocabulary = tokenizer.get_vocab()
            eos_token_id = tokenizer.eos_token_id
            eos_token = tokenizer.eos_token
            token_to_str = lambda token: tokenizer.convert_tokens_to_string([token]) # type: ignore
        else: # pragma: no cover
            raise ValueError(f"Unsupported model type: {type(model)}")

        self.eos_token_id = eos_token_id
        self.vocabulary = self.create_outlines_core_vocabulary(
            vocabulary, eos_token_id, eos_token, token_to_str
        )
        self.tensor_library_name = model.tensor_library_name

    def get_json_schema_logits_processor(
        self, json_schema: str
    ):
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        # Outlines Core handles JSON schemas by first converting them
        # into an equivalent regex.
        regex = build_regex_from_schema(json_schema)
        return self.get_regex_logits_processor(regex)

    def get_regex_logits_processor(self, regex: str):
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        index = Index(regex, self.vocabulary)
        return OutlinesCoreLogitsProcessor(index, self.tensor_library_name)

    def get_cfg_logits_processor(self, grammar):
        """Raise: Outlines Core has no CFG support."""
        raise NotImplementedError(
            "Outlines Core does not support context-free grammar."
        )

    @staticmethod
    def create_outlines_core_vocabulary(
        vocab: Dict[str, int],
        eos_token_id: int,
        eos_token: str,
        token_to_str: Callable[[str], str]
    ) -> Vocabulary:
        """Create an Outlines Core Vocabulary instance.

        Parameters
        ----------
        vocab: Dict[str, int]
            The vocabulary to create an Outlines Core vocabulary from.
        eos_token_id: int
            The EOS token ID.
        eos_token: str
            The EOS token.
        token_to_str: Callable[[str], str]
            The function to convert a token to a string.

        Returns
        -------
        Vocabulary
            The Outlines Core Vocabulary instance.

        """
        formatted_vocab: Dict[str, list] = {}
        for token, token_id in vocab.items():
            # This step is necessary to transform special tokens into their
            # string representation, in particular for spacing. We need those
            # string representations as outlines core first builds an FSM from
            # the regex provided that only contains regular strings.
            token_as_str = token_to_str(token)
            # Distinct token ids can decode to the same string; collect
            # all of them instead of keeping only the last one seen.
            formatted_vocab.setdefault(token_as_str, []).append(token_id)
        # Drop the EOS token (handled separately via eos_token_id); use a
        # default so a missing key does not raise KeyError.
        formatted_vocab.pop(eos_token, None)
        return Vocabulary(eos_token_id, formatted_vocab)

__init__(model)

Parameters:

Name Type Description Default
model SteerableModel

The Outlines model of the user.

required
Source code in outlines/backends/outlines_core.py
def __init__(self, model: SteerableModel):
    """
    Parameters
    ----------
    model
        The Outlines model of the user.

    """
    # Extract the tokenizer data needed to build the outlines-core
    # vocabulary; each model type exposes it differently.
    if isinstance(model, Transformers):
        tokenizer = model.tokenizer
        vocabulary = tokenizer.get_vocab()
        eos_token_id = tokenizer.eos_token_id
        eos_token = tokenizer.eos_token
        token_to_str = tokenizer.convert_token_to_string
    elif isinstance(model, LlamaCpp):
        tokenizer = model.tokenizer # type: ignore
        vocabulary = tokenizer.vocabulary
        eos_token_id = tokenizer.eos_token_id
        eos_token = tokenizer.eos_token
        token_to_str = tokenizer.convert_token_to_string
    elif isinstance(model, MLXLM): # pragma: no cover
        tokenizer = model.mlx_tokenizer # type: ignore
        vocabulary = tokenizer.get_vocab()
        eos_token_id = tokenizer.eos_token_id
        eos_token = tokenizer.eos_token
        token_to_str = lambda token: tokenizer.convert_tokens_to_string([token]) # type: ignore
    else: # pragma: no cover
        raise ValueError(f"Unsupported model type: {type(model)}")

    self.eos_token_id = eos_token_id
    self.vocabulary = self.create_outlines_core_vocabulary(
        vocabulary, eos_token_id, eos_token, token_to_str
    )
    self.tensor_library_name = model.tensor_library_name

create_outlines_core_vocabulary(vocab, eos_token_id, eos_token, token_to_str) staticmethod

Create an Outlines Core Vocabulary instance.

Parameters:

Name Type Description Default
vocab Dict[str, int]

The vocabulary to create an Outlines Core vocabulary from.

required
eos_token_id int

The EOS token ID.

required
eos_token str

The EOS token.

required
token_to_str Callable[[str], str]

The function to convert a token to a string.

required

Returns:

Type Description
Vocabulary

The Outlines Core Vocabulary instance.

Source code in outlines/backends/outlines_core.py
@staticmethod
def create_outlines_core_vocabulary(
    vocab: Dict[str, int],
    eos_token_id: int,
    eos_token: str,
    token_to_str: Callable[[str], str]
) -> Vocabulary:
    """Create an Outlines Core Vocabulary instance.

    Parameters
    ----------
    vocab: Dict[str, int]
        The vocabulary to create an Outlines Core vocabulary from.
    eos_token_id: int
        The EOS token ID.
    eos_token: str
        The EOS token.
    token_to_str: Callable[[str], str]
        The function to convert a token to a string.

    Returns
    -------
    Vocabulary
        The Outlines Core Vocabulary instance.

    """
    formatted_vocab: Dict[str, list] = {}
    for token, token_id in vocab.items():
        # This step is necessary to transform special tokens into their
        # string representation, in particular for spacing. We need those
        # string representations as outlines core first builds an FSM from
        # the regex provided that only contains regular strings.
        token_as_str = token_to_str(token)
        # Distinct token ids can decode to the same string; collect all
        # of them instead of keeping only the last one seen.
        formatted_vocab.setdefault(token_as_str, []).append(token_id)
    # Drop the EOS token (handled separately via eos_token_id); use a
    # default so a missing key does not raise KeyError.
    formatted_vocab.pop(eos_token, None)
    return Vocabulary(eos_token_id, formatted_vocab)

get_json_schema_logits_processor(json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/outlines_core.py
def get_json_schema_logits_processor(
    self, json_schema: str
):
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Outlines Core handles JSON schemas by first converting them into
    # an equivalent regex.
    regex = build_regex_from_schema(json_schema)
    return self.get_regex_logits_processor(regex)

get_regex_logits_processor(regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/outlines_core.py
def get_regex_logits_processor(self, regex: str):
    """Create a logits processor from a regex.

    Parameters
    ----------
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Build an outlines-core Index over the backend's vocabulary.
    index = Index(regex, self.vocabulary)
    return OutlinesCoreLogitsProcessor(index, self.tensor_library_name)

XGrammarBackend

Bases: BaseBackend

Backend for XGrammar.

Source code in outlines/backends/xgrammar.py
class XGrammarBackend(BaseBackend):
    """Backend for XGrammar."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        """
        # Deferred import: xgrammar is an optional dependency.
        import xgrammar as xgr

        if isinstance(model, Transformers):
            hf_tokenizer = model.hf_tokenizer
        elif isinstance(model, MLXLM):  # pragma: no cover
            hf_tokenizer = model.mlx_tokenizer._tokenizer
        else:  # pragma: no cover
            raise ValueError(
                "The xgrammar backend only supports Transformers and "
                "MLXLM models"
            )

        tokenizer_info = xgr.TokenizerInfo.from_huggingface(
            hf_tokenizer, vocab_size=len(hf_tokenizer.get_vocab())
        )
        self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
        self.tensor_library_name = model.tensor_library_name

    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        compiled = self.grammar_compiler.compile_json_schema(json_schema)
        return XGrammarLogitsProcessor(compiled, self.tensor_library_name)

    def get_regex_logits_processor(
        self, regex: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        compiled = self.grammar_compiler.compile_regex(regex)
        return XGrammarLogitsProcessor(compiled, self.tensor_library_name)

    def get_cfg_logits_processor(
        self, grammar: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        compiled = self.grammar_compiler.compile_grammar(grammar)
        return XGrammarLogitsProcessor(compiled, self.tensor_library_name)

__init__(model)

Parameters:

Name Type Description Default
model SteerableModel

The Outlines model of the user.

required
Source code in outlines/backends/xgrammar.py
def __init__(self, model: SteerableModel):
    """
    Parameters
    ----------
    model
        The Outlines model of the user.

    """
    # Deferred import: xgrammar is an optional dependency.
    import xgrammar as xgr

    if isinstance(model, Transformers):
        tokenizer = model.hf_tokenizer
    elif isinstance(model, MLXLM): # pragma: no cover
        tokenizer = model.mlx_tokenizer._tokenizer
    else: # pragma: no cover
        raise ValueError(
            "The xgrammar backend only supports Transformers and "
            + "MLXLM models"
        )

    # A single compiler instance is reused for all grammar kinds.
    tokenizer_info = xgr.TokenizerInfo.from_huggingface(
        tokenizer,
        vocab_size=len(tokenizer.get_vocab())
    )
    self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
    self.tensor_library_name = model.tensor_library_name

get_cfg_logits_processor(grammar)

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/xgrammar.py
def get_cfg_logits_processor(
    self, grammar: str
) -> XGrammarLogitsProcessor:
    """Create a logits processor from a context-free grammar.

    Parameters
    ----------
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Compile the grammar once; the processor reuses the compiled form.
    compiled_grammar = self.grammar_compiler.compile_grammar(grammar)
    return XGrammarLogitsProcessor(
        compiled_grammar,
        self.tensor_library_name
    )

get_json_schema_logits_processor(json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/xgrammar.py
def get_json_schema_logits_processor(
    self, json_schema: str
) -> XGrammarLogitsProcessor:
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Compile the schema once; the processor reuses the compiled form.
    compiled_grammar = self.grammar_compiler.compile_json_schema(
        json_schema
    )
    return XGrammarLogitsProcessor(
        compiled_grammar,
        self.tensor_library_name
    )

get_regex_logits_processor(regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/xgrammar.py
def get_regex_logits_processor(
    self, regex: str
) -> XGrammarLogitsProcessor:
    """Create a logits processor from a regex.

    Parameters
    ----------
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Compile the regex once; the processor reuses the compiled form.
    compiled_grammar = self.grammar_compiler.compile_regex(regex)
    return XGrammarLogitsProcessor(
        compiled_grammar,
        self.tensor_library_name
    )

get_cfg_logits_processor(backend_name, model, grammar)

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
backend_name str | None

The name of the backend to use.

required
model SteerableModel

The Outlines model of the user.

required
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/__init__.py
def get_cfg_logits_processor(
    backend_name: str | None,
    model: SteerableModel,
    grammar: str,
) -> LogitsProcessorType:
    """Create a logits processor from a context-free grammar.

    Parameters
    ----------
    backend_name: str | None
        The name of the backend to use.
    model: Model
        The Outlines model of the user.
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Fall back to the default CFG backend when no name was supplied.
    selected = backend_name or CFG_DEFAULT_BACKEND
    return _get_backend(selected, model).get_cfg_logits_processor(grammar)

get_json_schema_logits_processor(backend_name, model, json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
backend_name str | None

The name of the backend to use.

required
model SteerableModel

The Outlines model of the user.

required
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/__init__.py
def get_json_schema_logits_processor(
    backend_name: str | None,
    model: SteerableModel,
    json_schema: str,
) -> LogitsProcessorType:
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    backend_name: str | None
        The name of the backend to use.
    model: Model
        The Outlines model of the user.
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Fall back to the default JSON-schema backend when no name was supplied.
    selected = backend_name or JSON_SCHEMA_DEFAULT_BACKEND
    return _get_backend(selected, model).get_json_schema_logits_processor(
        json_schema
    )

get_regex_logits_processor(backend_name, model, regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
backend_name str | None

The name of the backend to use.

required
model SteerableModel

The Outlines model of the user.

required
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/__init__.py
def get_regex_logits_processor(
    backend_name: str | None,
    model: SteerableModel,
    regex: str,
) -> LogitsProcessorType:
    """Create a logits processor from a regex.

    Parameters
    ----------
    backend_name: str | None
        The name of the backend to use.
    model: Model
        The Outlines model of the user.
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Fall back to the default regex backend when no name was supplied.
    selected = backend_name or REGEX_DEFAULT_BACKEND
    return _get_backend(selected, model).get_regex_logits_processor(regex)

base

Base class for all backends.

BaseBackend

Bases: ABC

Base class for all backends.

The subclasses must implement methods that create a logits processor from a JSON schema, regex or CFG.

Source code in outlines/backends/base.py
class BaseBackend(ABC):
    """Base class for all backends.

    The subclasses must implement methods that create a logits processor
    from a JSON schema, regex or CFG. Known implementations in this
    package: LLGuidanceBackend, OutlinesCoreBackend, XGrammarBackend.

    """

    @abstractmethod
    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> LogitsProcessorType:
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

    @abstractmethod
    def get_regex_logits_processor(self, regex: str) -> LogitsProcessorType:
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

    @abstractmethod
    def get_cfg_logits_processor(self, grammar: str) -> LogitsProcessorType:
        """Create a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...
get_cfg_logits_processor(grammar) abstractmethod

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/base.py
@abstractmethod
def get_cfg_logits_processor(self, grammar: str) -> LogitsProcessorType:
    """Create a logits processor from a context-free grammar.

    Parameters
    ----------
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Abstract: each concrete backend provides the implementation.
    ...
get_json_schema_logits_processor(json_schema) abstractmethod

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/base.py
@abstractmethod
def get_json_schema_logits_processor(
    self, json_schema: str
) -> LogitsProcessorType:
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Abstract: each concrete backend provides the implementation.
    ...
get_regex_logits_processor(regex) abstractmethod

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessorType

The logits processor.

Source code in outlines/backends/base.py
@abstractmethod
def get_regex_logits_processor(self, regex: str) -> LogitsProcessorType:
    """Create a logits processor from a regex.

    Parameters
    ----------
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessorType
        The logits processor.

    """
    # Abstract: each concrete backend provides the implementation.
    ...

llguidance

Backend class for LLGuidance.

LLGuidanceBackend

Bases: BaseBackend

Backend for LLGuidance.

Source code in outlines/backends/llguidance.py
class LLGuidanceBackend(BaseBackend):
    """Backend for LLGuidance."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model whose tokenizer and tensor library this
            backend should use.

        """
        # Imported lazily so llguidance is only required when this
        # backend is actually instantiated.
        import llguidance as llg

        self.llg = llg
        self.tensor_library_name = model.tensor_library_name
        self.llg_tokenizer = self._create_llg_tokenizer(model)

    def _create_llg_tokenizer(self, model: SteerableModel) -> "LLGTokenizer":
        """Build an llg tokenizer from the Outlines model's tokenizer.

        Parameters
        ----------
        model: Model
            The Outlines model.

        Returns
        -------
        LLGTokenizer
            The llg tokenizer.

        """
        if isinstance(model, Transformers):
            import llguidance.hf

            return llguidance.hf.from_tokenizer(model.hf_tokenizer)

        if isinstance(model, LlamaCpp):
            import llama_cpp
            import llguidance.llamacpp

            llama_vocab = llama_cpp.llama_model_get_vocab(model.model.model)
            return llguidance.llamacpp.lltokenizer_from_vocab(llama_vocab)

        if isinstance(model, MLXLM): # pragma: no cover
            import llguidance.hf

            return llguidance.hf.from_tokenizer(
                model.mlx_tokenizer._tokenizer
            )

        raise ValueError( # pragma: no cover
            f"Unsupported model type: {type(model)}. "
            "Llguidance only supports LlamaCpp, MLXLM "
            "and Transformers models."
        )

    def _to_processor(self, grammar_spec) -> LLGuidanceLogitsProcessor:
        """Wrap an llguidance grammar spec in a logits processor."""
        return LLGuidanceLogitsProcessor(
            grammar_spec, self.llg_tokenizer, self.tensor_library_name
        )

    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> LLGuidanceLogitsProcessor:
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._to_processor(
            self.llg.grammar_from("json_schema", json_schema)
        )

    def get_regex_logits_processor(
        self, regex: str
    ) -> LLGuidanceLogitsProcessor:
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._to_processor(self.llg.grammar_from("regex", regex))

    def get_cfg_logits_processor(
        self, grammar: str
    ) -> LLGuidanceLogitsProcessor:
        """Create a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        # llguidance accepts several grammar syntaxes; try the generic
        # "grammar" format first and fall back to Lark syntax.
        try:
            grammar_spec = self.llg.grammar_from("grammar", grammar)
        except ValueError:
            grammar_spec = self.llg.grammar_from("lark", grammar)
        return self._to_processor(grammar_spec)
__init__(model)

Parameters:

Name Type Description Default
model SteerableModel

The Outlines model of the user.

required
Source code in outlines/backends/llguidance.py
def __init__(self, model: SteerableModel):
    """
    Parameters
    ----------
    model
        The Outlines model of the user.

    """
    # Imported lazily so llguidance is only required when this backend
    # is actually instantiated.
    import llguidance as llg

    self.llg = llg
    self.tensor_library_name = model.tensor_library_name
    # Convert the model's tokenizer into an llguidance tokenizer; only
    # Transformers, LlamaCpp and MLXLM models are supported.
    self.llg_tokenizer = self._create_llg_tokenizer(model)
get_cfg_logits_processor(grammar)

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/llguidance.py
def get_cfg_logits_processor(
    self, grammar: str
) -> LLGuidanceLogitsProcessor:
    """Create a logits processor from a context-free grammar.

    Parameters
    ----------
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # llguidance accepts several grammar syntaxes; try the generic
    # "grammar" format first and fall back to Lark syntax if it is
    # rejected.
    try:
        grammar_spec = self.llg.grammar_from("grammar", grammar)
    except ValueError:
        grammar_spec = self.llg.grammar_from("lark", grammar)
    return LLGuidanceLogitsProcessor(
        grammar_spec, self.llg_tokenizer, self.tensor_library_name
    )
get_json_schema_logits_processor(json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/llguidance.py
def get_json_schema_logits_processor(
    self, json_schema: str
) -> LLGuidanceLogitsProcessor:
    """Build a logits processor constraining output to a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema the generated text must conform to.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    spec = self.llg.grammar_from("json_schema", json_schema)
    return LLGuidanceLogitsProcessor(
        spec, self.llg_tokenizer, self.tensor_library_name
    )
get_regex_logits_processor(regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/llguidance.py
def get_regex_logits_processor(
    self, regex: str
) -> LLGuidanceLogitsProcessor:
    """Build a logits processor constraining output to a regex.

    Parameters
    ----------
    regex: str
        The regular expression the generated text must match.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    spec = self.llg.grammar_from("regex", regex)
    return LLGuidanceLogitsProcessor(
        spec, self.llg_tokenizer, self.tensor_library_name
    )

LLGuidanceLogitsProcessor

Bases: OutlinesLogitsProcessor

Logits Processor for the LLGuidance backend.

Source code in outlines/backends/llguidance.py
class LLGuidanceLogitsProcessor(OutlinesLogitsProcessor):
    """Logits Processor for the LLGuidance backend."""

    def __init__(
        self,
        grammar: str,
        llg_tokenizer,
        tensor_library_name: str,
    ) -> None:
        """
        Parameters
        ----------
        grammar: str
            The grammar spec to use to create the LLMatcher
        llg_tokenizer: LLTokenizer
            The LLGuidance tokenizer
        tensor_library_name: str
            The name of the tensor library used by the model

        """
        # Matchers and bitmask are created lazily in `_setup` because they
        # depend on the batch size, only known at the first generation step.
        self.is_first_token = True
        self.grammar = grammar
        self.llg_tokenizer = llg_tokenizer
        self.tensor_library_name = tensor_library_name
        super().__init__(tensor_library_name)

    def reset(self):
        """Ensure self._setup is called again for the next generation."""
        self.is_first_token = True

    def _setup(self, batch_size: int) -> None:
        """Setup the LLMatchers, the bitmask and some functions used in the
        `process_logits` method.

        This method is called when the first token is generated instead of
        at initialization because we need to know the batch size.

        Parameters
        ----------
        batch_size: int
            The batch size of the input

        """
        from llguidance import LLMatcher

        # One matcher per sequence in the batch; each tracks its own
        # position in the grammar.
        self.ll_matchers = [
            LLMatcher(self.llg_tokenizer, self.grammar)
            for _ in range(batch_size)
        ]

        # we must adapt the bitmask creation and the bias function to the
        # tensor library used by the model
        if self.tensor_library_name == "torch":
            import llguidance.torch

            self.bitmask = llguidance.torch.allocate_token_bitmask(batch_size, self.llg_tokenizer.vocab_size)
            self._bias_logits = self._bias_logits_torch
        elif self.tensor_library_name == "numpy":
            import llguidance.numpy

            self.bitmask = llguidance.numpy.allocate_token_bitmask(batch_size, self.llg_tokenizer.vocab_size)
            self._bias_logits = self._bias_logits_numpy
        elif self.tensor_library_name == "mlx": # pragma: no cover
            import llguidance.numpy

            # MLX reuses the numpy bitmask; only the apply step differs.
            self.bitmask = llguidance.numpy.allocate_token_bitmask(batch_size, self.llg_tokenizer.vocab_size)
            self._bias_logits = self._bias_logits_mlx
        else: # pragma: no cover
            raise ValueError(f"Unsupported tensor library: {self.tensor_library_name}")

    def _bias_logits_mlx( # pragma: no cover
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for the MLX backend.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        """
        import llguidance.mlx
        import llguidance.numpy

        biased_logits_array = []
        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            llguidance.numpy.fill_next_token_bitmask(self.ll_matchers[i], self.bitmask, i)
            # MLX's apply is not in-place: collect the biased rows and
            # concatenate them afterwards.
            biased_logits = llguidance.mlx.apply_token_bitmask(
                logits[i], self.bitmask[i] # type: ignore
            )
            biased_logits_array.append(biased_logits)

        return self.tensor_adapter.concatenate(biased_logits_array)

    def _bias_logits_torch(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for the Torch backend.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        """
        import llguidance.torch

        batch_size = self.tensor_adapter.shape(input_ids)[0]

        # Fill every row of the bitmask on CPU first, then do a single
        # round-trip to the logits device instead of one transfer per
        # sequence (llguidance fills the bitmask on CPU only).
        for i in range(batch_size):
            llguidance.torch.fill_next_token_bitmask(self.ll_matchers[i], self.bitmask, i)

        self.bitmask = self.tensor_adapter.to_device(
            self.bitmask,
            self.tensor_adapter.get_device(logits)
        )
        for i in range(batch_size):
            llguidance.torch.apply_token_bitmask_inplace(
                logits[i], # type: ignore
                self.bitmask[i]
            )
        # Move the bitmask back to CPU so the next step can fill it again.
        self.bitmask = self.tensor_adapter.to_device(
            self.bitmask,
            "cpu"
        )

        return logits

    def _bias_logits_numpy(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for the Numpy backend.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        """
        import llguidance.numpy

        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            llguidance.numpy.fill_next_token_bitmask(self.ll_matchers[i], self.bitmask, i)
            llguidance.numpy.apply_token_bitmask_inplace(
                logits[i], self.bitmask[i] # type: ignore
            )

        return logits

    def process_logits(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Use the instances of LLMatcher to bias the logits.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        """
        if self.is_first_token:
            self._setup(self.tensor_adapter.shape(input_ids)[0])
            self.is_first_token = False

        # we do not make the matchers consume the last token during the first
        # generation step because no tokens have been generated yet
        else:
            for i in range(self.tensor_adapter.shape(input_ids)[0]):
                sequence = input_ids[i] # type: ignore
                last_token = sequence[-1].item()
                self.ll_matchers[i].consume_token(last_token)
                error = self.ll_matchers[i].get_error()
                if error:
                    # A matcher error means the sampled token was not
                    # accepted by the grammar; warn instead of raising so
                    # generation can continue.
                    warnings.warn(f"Error in LLMatcher: {error}")

        return self._bias_logits(input_ids, logits)
__init__(grammar, llg_tokenizer, tensor_library_name)

Parameters:

Name Type Description Default
grammar str

The grammar spec to use to create the LLMatcher

required
llg_tokenizer

The LLGuidance tokenizer

required
tensor_library_name str

The name of the tensor library used by the model

required
Source code in outlines/backends/llguidance.py
def __init__(
    self,
    grammar: str,
    llg_tokenizer,
    tensor_library_name: str,
) -> None:
    """
    Parameters
    ----------
    grammar: str
        The grammar spec to use to create the LLMatcher
    llg_tokenizer: LLTokenizer
        The LLGuidance tokenizer
    tensor_library_name: str
        The name of the tensor library used by the model

    """
    # Matchers and bitmask are created lazily on the first call to
    # `process_logits` because they depend on the batch size.
    self.is_first_token = True
    self.grammar = grammar
    self.llg_tokenizer = llg_tokenizer
    self.tensor_library_name = tensor_library_name
    super().__init__(tensor_library_name)
process_logits(input_ids, logits)

Use the instances of LLMatcher to bias the logits.

Parameters:

Name Type Description Default
input_ids TensorType

The ids of the tokens of the existing sequences.

required
logits TensorType

The logits for the current generation step.

required

Returns:

Type Description
TensorType

The biased logits.

Source code in outlines/backends/llguidance.py
def process_logits(
    self, input_ids: TensorType, logits: TensorType
) -> TensorType:
    """Use the instances of LLMatcher to bias the logits.

    Parameters
    ----------
    input_ids
        The ids of the tokens of the existing sequences.
    logits
        The logits for the current generation step.

    Returns
    -------
    TensorType
        The biased logits.

    """
    if self.is_first_token:
        # Lazily create the matchers and bitmask now that the batch
        # size is known.
        self._setup(self.tensor_adapter.shape(input_ids)[0])
        self.is_first_token = False

    # we do not make the matchers consume the last token during the first
    # generation step because no tokens have been generated yet
    else:
        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            sequence = input_ids[i] # type: ignore
            last_token = sequence[-1].item()
            # Advance each matcher past the token sampled at the
            # previous step before computing the next token mask.
            self.ll_matchers[i].consume_token(last_token)
            error = self.ll_matchers[i].get_error()
            if error:
                # Warn instead of raising so generation can continue.
                warnings.warn(f"Error in LLMatcher: {error}")

    return self._bias_logits(input_ids, logits)
reset()

Ensure self._setup is called again for the next generation.

Source code in outlines/backends/llguidance.py
def reset(self):
    """Ensure self._setup is called again for the next generation."""
    # `process_logits` re-runs `_setup` whenever this flag is True.
    self.is_first_token = True

outlines_core

Backend class for Outlines Core.

OutlinesCoreBackend

Bases: BaseBackend

Backend for Outlines Core.

Source code in outlines/backends/outlines_core.py
class OutlinesCoreBackend(BaseBackend):
    """Backend for Outlines Core."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        """
        # Extract the vocabulary and EOS information from the model's
        # tokenizer; the attribute names differ per model family.
        if isinstance(model, Transformers):
            tokenizer = model.tokenizer
            vocabulary = tokenizer.get_vocab()
            eos_token_id = tokenizer.eos_token_id
            eos_token = tokenizer.eos_token
            token_to_str = tokenizer.convert_token_to_string
        elif isinstance(model, LlamaCpp):
            tokenizer = model.tokenizer # type: ignore
            vocabulary = tokenizer.vocabulary
            eos_token_id = tokenizer.eos_token_id
            eos_token = tokenizer.eos_token
            token_to_str = tokenizer.convert_token_to_string
        elif isinstance(model, MLXLM): # pragma: no cover
            tokenizer = model.mlx_tokenizer # type: ignore
            vocabulary = tokenizer.get_vocab()
            eos_token_id = tokenizer.eos_token_id
            eos_token = tokenizer.eos_token
            token_to_str = lambda token: tokenizer.convert_tokens_to_string([token]) # type: ignore
        else: # pragma: no cover
            raise ValueError(f"Unsupported model type: {type(model)}")

        self.eos_token_id = eos_token_id
        self.vocabulary = self.create_outlines_core_vocabulary(
            vocabulary, eos_token_id, eos_token, token_to_str
        )
        self.tensor_library_name = model.tensor_library_name

    def get_json_schema_logits_processor(
        self, json_schema: str
    ):
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        # Outlines Core handles JSON schemas by first turning them into
        # an equivalent regular expression.
        regex = build_regex_from_schema(json_schema)
        return self.get_regex_logits_processor(regex)

    def get_regex_logits_processor(self, regex: str):
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        index = Index(regex, self.vocabulary)
        return OutlinesCoreLogitsProcessor(index, self.tensor_library_name)

    def get_cfg_logits_processor(self, grammar):
        """Raise: Outlines Core cannot constrain generation with a CFG."""
        raise NotImplementedError(
            "Outlines Core does not support context-free grammar."
        )

    @staticmethod
    def create_outlines_core_vocabulary(
        vocab: Dict[str, int],
        eos_token_id: int,
        eos_token: str,
        token_to_str: Callable[[str], str]
    ) -> Vocabulary:
        """Create an Outlines Core Vocabulary instance.

        Parameters
        ----------
        vocab: Dict[str, int]
            The vocabulary to create an Outlines Core vocabulary from.
        eos_token_id: int
            The EOS token ID.
        eos_token: str
            The EOS token.
        token_to_str: Callable[[str], str]
            The function to convert a token to a string.

        Returns
        -------
        Vocabulary
            The Outlines Core Vocabulary instance.

        """
        formatted_vocab: Dict[str, list] = {}
        for token, token_id in vocab.items():
            # This step is necessary to transform special tokens into their
            # string representation, in particular for spacing. We need those
            # string representations as outlines core first builds an FSM from
            # the regex provided that only contains regular strings.
            token_as_str = token_to_str(token)
            # Several distinct token ids can decode to the same string
            # (common with byte-level tokenizers); collect all of them
            # instead of letting the last id overwrite the others.
            formatted_vocab.setdefault(token_as_str, []).append(token_id)
        # The EOS token is passed separately to the Vocabulary, so remove
        # it from the regular entries.
        formatted_vocab.pop(eos_token)
        return Vocabulary(eos_token_id, formatted_vocab)
__init__(model)

Parameters:

Name Type Description Default
model SteerableModel

The Outlines model of the user.

required
Source code in outlines/backends/outlines_core.py
def __init__(self, model: SteerableModel):
    """
    Parameters
    ----------
    model
        The Outlines model of the user.

    """
    # Pick the tokenizer, raw vocabulary and token-to-string converter
    # depending on the model family; attribute names differ per family.
    if isinstance(model, Transformers):
        tokenizer = model.tokenizer
        vocabulary = tokenizer.get_vocab()
        token_to_str = tokenizer.convert_token_to_string
    elif isinstance(model, LlamaCpp):
        tokenizer = model.tokenizer # type: ignore
        vocabulary = tokenizer.vocabulary
        token_to_str = tokenizer.convert_token_to_string
    elif isinstance(model, MLXLM): # pragma: no cover
        tokenizer = model.mlx_tokenizer # type: ignore
        vocabulary = tokenizer.get_vocab()
        token_to_str = lambda token: tokenizer.convert_tokens_to_string([token]) # type: ignore
    else: # pragma: no cover
        raise ValueError(f"Unsupported model type: {type(model)}")

    # All supported tokenizers expose the EOS token the same way.
    eos_token_id = tokenizer.eos_token_id
    eos_token = tokenizer.eos_token

    self.eos_token_id = eos_token_id
    self.vocabulary = self.create_outlines_core_vocabulary(
        vocabulary, eos_token_id, eos_token, token_to_str
    )
    self.tensor_library_name = model.tensor_library_name
create_outlines_core_vocabulary(vocab, eos_token_id, eos_token, token_to_str) staticmethod

Create an Outlines Core Vocabulary instance.

Parameters:

Name Type Description Default
vocab Dict[str, int]

The vocabulary to create an Outlines Core vocabulary from.

required
eos_token_id int

The EOS token ID.

required
eos_token str

The EOS token.

required
token_to_str Callable[[str], str]

The function to convert a token to a string.

required

Returns:

Type Description
Vocabulary

The Outlines Core Vocabulary instance.

Source code in outlines/backends/outlines_core.py
@staticmethod
def create_outlines_core_vocabulary(
    vocab: Dict[str, int],
    eos_token_id: int,
    eos_token: str,
    token_to_str: Callable[[str], str]
) -> Vocabulary:
    """Create an Outlines Core Vocabulary instance.

    Parameters
    ----------
    vocab: Dict[str, int]
        The vocabulary to create an Outlines Core vocabulary from.
    eos_token_id: int
        The EOS token ID.
    eos_token: str
        The EOS token.
    token_to_str: Callable[[str], str]
        The function to convert a token to a string.

    Returns
    -------
    Vocabulary
        The Outlines Core Vocabulary instance.

    """
    formatted_vocab: Dict[str, list] = {}
    for token, token_id in vocab.items():
        # This step is necessary to transform special tokens into their
        # string representation, in particular for spacing. We need those
        # string representations as outlines core first builds an FSM from
        # the regex provided that only contains regular strings.
        token_as_str = token_to_str(token)
        # Several distinct token ids can decode to the same string
        # (common with byte-level tokenizers); collect all of them
        # instead of letting the last id overwrite the others.
        formatted_vocab.setdefault(token_as_str, []).append(token_id)
    # The EOS token is passed separately to the Vocabulary, so remove it
    # from the regular entries.
    formatted_vocab.pop(eos_token)
    return Vocabulary(eos_token_id, formatted_vocab)
get_json_schema_logits_processor(json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/outlines_core.py
def get_json_schema_logits_processor(
    self, json_schema: str
):
    """Build a logits processor constraining output to a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema the generated text must conform to.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    # Outlines Core handles JSON schemas by first turning them into an
    # equivalent regular expression.
    return self.get_regex_logits_processor(
        build_regex_from_schema(json_schema)
    )
get_regex_logits_processor(regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/outlines_core.py
def get_regex_logits_processor(self, regex: str):
    """Build a logits processor constraining output to a regex.

    Parameters
    ----------
    regex: str
        The regular expression the generated text must match.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    compiled_index = Index(regex, self.vocabulary)
    return OutlinesCoreLogitsProcessor(
        compiled_index, self.tensor_library_name
    )

OutlinesCoreLogitsProcessor

Bases: OutlinesLogitsProcessor

Logits processor for Outlines Core.

Source code in outlines/backends/outlines_core.py
class OutlinesCoreLogitsProcessor(OutlinesLogitsProcessor):
    """Logits processor for Outlines Core."""

    def __init__(
        self, index: Index, tensor_library_name: str
    ):
        """
        Parameters
        ----------
        index: Index
            The Outlines Core `Index` instance to use to create the Outlines
            Core `Guide` instances that will be used to bias the logits
        tensor_library_name: str
            The tensor library name to use for the logits processor.

        """
        self.index = index
        self.tensor_library_name = tensor_library_name
        # Guides and bitmasks are created lazily in `_setup` because they
        # depend on the batch and vocabulary sizes.
        self.is_first_token = True
        super().__init__(tensor_library_name)

    def reset(self) -> None:
        """Reset the logits processor."""
        # `process_logits` re-runs `_setup` whenever this flag is True.
        self.is_first_token = True

    def _setup(self, batch_size: int, vocab_size: int) -> None:
        """Set the guides, bitmasks and some functions used in the
        `process_logits` method.

        This method is called when the first token is generated instead of
        at initialization because we need to know the batch size and the device
        of the logits.

        Parameters
        ----------
        batch_size: int
            The batch size.
        vocab_size: int
            The vocabulary size.

        """
        # Select the bitmask allocator and bias function matching the
        # model's tensor library.
        if self.tensor_library_name == "torch":
            from outlines_core.kernels.torch import allocate_token_bitmask

            self.allocate_token_bitmask = allocate_token_bitmask
            self.bias_logits = self._bias_logits_torch

        elif self.tensor_library_name == "numpy":
            from outlines_core.kernels.numpy import allocate_token_bitmask

            self.allocate_token_bitmask = allocate_token_bitmask
            self.bias_logits = self._bias_logits_numpy

        elif self.tensor_library_name == "mlx": # pragma: no cover
            from outlines_core.kernels.mlx import (
                allocate_token_bitmask
            )

            self.allocate_token_bitmask = allocate_token_bitmask
            self.bias_logits = self._bias_logits_mlx

        else: # pragma: no cover
            raise ValueError(
                f"Unsupported tensor library: {self.tensor_library_name}"
            )

        # One guide (FSM walker) and one bitmask per sequence in the batch.
        self._guides = [Guide(self.index) for _ in range(batch_size)]
        self._bitmasks = [
            self.allocate_token_bitmask(vocab_size)
            for _ in range(batch_size)
        ]

    def _bias_logits_mlx( # pragma: no cover
        self, batch_size: int, logits: TensorType
    ) -> TensorType:
        """Bias the logits for MLX tensors."""
        from outlines_core.kernels.mlx import (
            apply_token_bitmask,
            fill_next_token_bitmask
        )

        # MLX's apply is not in-place: collect the biased rows and
        # concatenate them afterwards.
        biased_logits_array = []
        for i in range(batch_size):
            fill_next_token_bitmask(self._guides[i], self._bitmasks[i])
            biased_logits = apply_token_bitmask(
                self.tensor_adapter.unsqueeze(logits[i]), self._bitmasks[i] # type: ignore
            )
            biased_logits_array.append(biased_logits)

        return self.tensor_adapter.concatenate(biased_logits_array)

    def _bias_logits_torch(
        self, batch_size: int, logits: TensorType
    ) -> TensorType:
        """Bias the logits for Torch tensors."""
        from outlines_core.kernels.torch import (
            apply_token_bitmask_inplace,
            fill_next_token_bitmask
        )

        for i in range(batch_size):
            # The bitmask is filled on CPU, moved to the logits device for
            # the in-place apply, then moved back for the next fill.
            fill_next_token_bitmask(self._guides[i], self._bitmasks[i])
            self._bitmasks[i] = self.tensor_adapter.to_device(
                self._bitmasks[i],
                self.tensor_adapter.get_device(logits)
            )
            apply_token_bitmask_inplace(
                self.tensor_adapter.unsqueeze(logits[i]), # type: ignore
                self._bitmasks[i]
            )
            self._bitmasks[i] = self.tensor_adapter.to_device(
                self._bitmasks[i],
                "cpu"
            )

        return logits

    def _bias_logits_numpy(
        self, batch_size: int, logits: TensorType
    ) -> TensorType:
        """Bias the logits for Numpy tensors."""
        from outlines_core.kernels.numpy import (
            apply_token_bitmask_inplace,
            fill_next_token_bitmask
        )

        for i in range(batch_size):
            fill_next_token_bitmask(self._guides[i], self._bitmasks[i])
            apply_token_bitmask_inplace(
                self.tensor_adapter.unsqueeze(logits[i]), # type: ignore
                self._bitmasks[i]
            )

        return logits

    def process_logits(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Use the guides to bias the logits.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        """
        batch_size = self.tensor_adapter.shape(input_ids)[0]
        vocab_size = self.tensor_adapter.shape(logits)[1]

        if self.is_first_token:
            self._setup(batch_size, vocab_size)
            self.is_first_token = False
        else:
            # Advance each guide past the token sampled at the previous
            # generation step.
            for i in range(batch_size):
                last_token_id = self.tensor_adapter.to_scalar(input_ids[i][-1]) # type: ignore
                # This circumvents issue #227 in outlines_core
                # Ideally, we would be able to advance all the times as the final
                # state would accept the eos token leading to itself
                if (
                    not self._guides[i].is_finished()
                    or self._guides[i].accepts_tokens([last_token_id])
                ):
                    self._guides[i].advance(
                        token_id=last_token_id,
                        return_tokens=False
                    )

        return self.bias_logits(batch_size, logits)
__init__(index, tensor_library_name)

Parameters:

Name Type Description Default
index Index

The Outlines Core Index instance to use to create the Outlines Core Guide instances that will be used to bias the logits

required
tensor_library_name str

The tensor library name to use for the logits processor.

required
Source code in outlines/backends/outlines_core.py
def __init__(
    self, index: Index, tensor_library_name: str
):
    """
    Parameters
    ----------
    index: Index
        The Outlines Core `Index` instance to use to create the Outlines
        Core `Guide` instances that will be used to bias the logits
    tensor_library_name: str
        The tensor library name to use for the logits processor.

    """
    self.index = index
    self.tensor_library_name = tensor_library_name
    # Guides and bitmasks are created lazily on the first call to
    # `process_logits` because they depend on batch and vocab sizes.
    self.is_first_token = True
    super().__init__(tensor_library_name)
process_logits(input_ids, logits)

Use the guides to bias the logits.

Parameters:

Name Type Description Default
input_ids TensorType

The ids of the tokens of the existing sequences.

required
logits TensorType

The logits for the current generation step.

required

Returns:

Type Description
TensorType

The biased logits.

Source code in outlines/backends/outlines_core.py
def process_logits(
    self, input_ids: TensorType, logits: TensorType
) -> TensorType:
    """Bias the logits using the per-sequence guides.

    Parameters
    ----------
    input_ids
        The ids of the tokens of the existing sequences.
    logits
        The logits for the current generation step.

    Returns
    -------
    TensorType
        The biased logits.

    """
    batch_size = self.tensor_adapter.shape(input_ids)[0]
    vocab_size = self.tensor_adapter.shape(logits)[1]

    if self.is_first_token:
        # First call of a generation: build one guide per sequence.
        self._setup(batch_size, vocab_size)
        self.is_first_token = False
        return self.bias_logits(batch_size, logits)

    for i in range(batch_size):
        latest = self.tensor_adapter.to_scalar(input_ids[i][-1]) # type: ignore
        guide = self._guides[i]
        # Workaround for issue #227 in outlines_core: ideally we would
        # always advance, as the final state would accept the eos token
        # leading to itself; instead only advance when the guide is
        # unfinished or the token is explicitly accepted.
        if not guide.is_finished() or guide.accepts_tokens([latest]):
            guide.advance(
                token_id=latest,
                return_tokens=False
            )

    return self.bias_logits(batch_size, logits)
reset()

Reset the logits processor.

Source code in outlines/backends/outlines_core.py
def reset(self) -> None:
    """Reset the logits processor for a fresh generation."""
    # Re-arm lazy setup so the guides are rebuilt on the next call.
    self.is_first_token = True

xgrammar

Backend class for XGrammar.

XGrammarBackend

Bases: BaseBackend

Backend for XGrammar.

Source code in outlines/backends/xgrammar.py
class XGrammarBackend(BaseBackend):
    """Backend for XGrammar."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        """
        import xgrammar as xgr

        # Select the underlying HuggingFace tokenizer for the supported
        # model families.
        if isinstance(model, Transformers):
            hf_tokenizer = model.hf_tokenizer
        elif isinstance(model, MLXLM): # pragma: no cover
            hf_tokenizer = model.mlx_tokenizer._tokenizer
        else: # pragma: no cover
            raise ValueError(
                "The xgrammar backend only supports Transformers and "
                + "MLXLM models"
            )

        # XGrammar needs the tokenizer metadata to map grammar states to
        # concrete token ids.
        tokenizer_info = xgr.TokenizerInfo.from_huggingface(
            hf_tokenizer, vocab_size=len(hf_tokenizer.get_vocab())
        )
        self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
        self.tensor_library_name = model.tensor_library_name

    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return XGrammarLogitsProcessor(
            self.grammar_compiler.compile_json_schema(json_schema),
            self.tensor_library_name,
        )

    def get_regex_logits_processor(
        self, regex: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return XGrammarLogitsProcessor(
            self.grammar_compiler.compile_regex(regex),
            self.tensor_library_name,
        )

    def get_cfg_logits_processor(
        self, grammar: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return XGrammarLogitsProcessor(
            self.grammar_compiler.compile_grammar(grammar),
            self.tensor_library_name,
        )
__init__(model)

Parameters:

Name Type Description Default
model SteerableModel

The Outlines model of the user.

required
Source code in outlines/backends/xgrammar.py
def __init__(self, model: SteerableModel):
    """
    Parameters
    ----------
    model
        The Outlines model of the user.

    """
    import xgrammar as xgr

    # Pick the underlying HuggingFace tokenizer for the supported model
    # families.
    if isinstance(model, Transformers):
        hf_tokenizer = model.hf_tokenizer
    elif isinstance(model, MLXLM): # pragma: no cover
        hf_tokenizer = model.mlx_tokenizer._tokenizer
    else: # pragma: no cover
        raise ValueError(
            "The xgrammar backend only supports Transformers and "
            + "MLXLM models"
        )

    tokenizer_info = xgr.TokenizerInfo.from_huggingface(
        hf_tokenizer, vocab_size=len(hf_tokenizer.get_vocab())
    )
    self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
    self.tensor_library_name = model.tensor_library_name
get_cfg_logits_processor(grammar)

Create a logits processor from a context-free grammar.

Parameters:

Name Type Description Default
grammar str

The context-free grammar to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/xgrammar.py
def get_cfg_logits_processor(
    self, grammar: str
) -> XGrammarLogitsProcessor:
    """Create a logits processor from a context-free grammar.

    Parameters
    ----------
    grammar: str
        The context-free grammar to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    return XGrammarLogitsProcessor(
        self.grammar_compiler.compile_grammar(grammar),
        self.tensor_library_name,
    )
get_json_schema_logits_processor(json_schema)

Create a logits processor from a JSON schema.

Parameters:

Name Type Description Default
json_schema str

The JSON schema to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/xgrammar.py
def get_json_schema_logits_processor(
    self, json_schema: str
) -> XGrammarLogitsProcessor:
    """Create a logits processor from a JSON schema.

    Parameters
    ----------
    json_schema: str
        The JSON schema to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    return XGrammarLogitsProcessor(
        self.grammar_compiler.compile_json_schema(json_schema),
        self.tensor_library_name,
    )
get_regex_logits_processor(regex)

Create a logits processor from a regex.

Parameters:

Name Type Description Default
regex str

The regex to create a logits processor from.

required

Returns:

Type Description
LogitsProcessor

The logits processor to use to constrain the generation.

Source code in outlines/backends/xgrammar.py
def get_regex_logits_processor(
    self, regex: str
) -> XGrammarLogitsProcessor:
    """Create a logits processor from a regex.

    Parameters
    ----------
    regex: str
        The regex to create a logits processor from.

    Returns
    -------
    LogitsProcessor
        The logits processor to use to constrain the generation.

    """
    return XGrammarLogitsProcessor(
        self.grammar_compiler.compile_regex(regex),
        self.tensor_library_name,
    )

XGrammarLogitsProcessor

Bases: OutlinesLogitsProcessor

Logits processor for XGrammar.

Source code in outlines/backends/xgrammar.py
class XGrammarLogitsProcessor(OutlinesLogitsProcessor):
    """Logits processor for XGrammar."""

    def __init__(self, compiled_grammar: str, tensor_library_name: str,):
        """
        Parameters
        ----------
        compiled_grammar: str
            The compiled grammar to use to create the logits processor.
        tensor_library_name: str
            The name of the tensor library used by the model

        """
        import xgrammar as xgr

        self.xgr = xgr
        self.compiled_grammar = compiled_grammar
        self.tensor_library_name = tensor_library_name
        # Matchers and the bitmask are allocated lazily on the first
        # `process_logits` call, once the batch size is known.
        self.is_first_token = True
        super().__init__(tensor_library_name)

    def reset(self):
        """Ensure self._setup is called again for the next generation."""
        self.is_first_token = True

    def _setup(self, batch_size: int, vocab_size: int) -> None:
        """Setup the logits processor for a new generation."""
        if self.tensor_library_name == "torch":
            self._bias_logits = self._bias_logits_torch
        elif self.tensor_library_name == "mlx": # pragma: no cover
            self._bias_logits = self._bias_logits_mlx
        else: # pragma: no cover
            raise ValueError(
                f"Unsupported tensor library: {self.tensor_library_name}"
            )

        # One matcher per sequence in the batch, all sharing the same
        # compiled grammar; the bitmask holds one row per sequence.
        self._matchers = [
            self.xgr.GrammarMatcher(self.compiled_grammar)
            for _ in range(batch_size)
        ]
        self._bitmask = self.xgr.allocate_token_bitmask(batch_size, vocab_size)

    def _bias_logits_torch(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for Torch tensors."""
        n_sequences = self.tensor_adapter.shape(input_ids)[0]
        for i in range(n_sequences):
            if not self._matchers[i].is_terminated():
                self._matchers[i].fill_next_token_bitmask(self._bitmask, i)

        # The bitmask must live on the logits' device while it is
        # applied, then goes back to the CPU for the matchers.
        device = self.tensor_adapter.get_device(logits)
        self._bitmask = self.tensor_adapter.to_device(self._bitmask, device)
        self.xgr.apply_token_bitmask_inplace(logits, self._bitmask)
        self._bitmask = self.tensor_adapter.to_device(self._bitmask, "cpu")

        return logits

    def _bias_logits_mlx( # pragma: no cover
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for MLX tensors."""
        import mlx.core as mx
        from xgrammar.kernels.apply_token_bitmask_mlx import apply_token_bitmask_mlx

        n_sequences = self.tensor_adapter.shape(input_ids)[0]
        for i in range(n_sequences):
            if not self._matchers[i].is_terminated():
                self._matchers[i].fill_next_token_bitmask(self._bitmask, i)

        vocab_size = self.tensor_adapter.shape(logits)[1]
        return apply_token_bitmask_mlx(
            mx.array(self._bitmask.numpy()), logits, vocab_size
        )

    def process_logits(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Use the XGrammar matchers to bias the logits."""
        batch_size = self.tensor_adapter.shape(input_ids)[0]
        vocab_size = self.tensor_adapter.shape(logits)[1]

        if self.is_first_token:
            self._setup(batch_size, vocab_size)
            self.is_first_token = False
            return self._bias_logits(input_ids, logits)

        # Feed each live matcher the token sampled at the previous step.
        for i in range(batch_size):
            if not self._matchers[i].is_terminated(): # pragma: no cover
                latest = self.tensor_adapter.to_scalar(
                    input_ids[i][-1] # type: ignore
                )
                assert self._matchers[i].accept_token(latest)

        return self._bias_logits(input_ids, logits)
__init__(compiled_grammar, tensor_library_name)

Parameters:

Name Type Description Default
compiled_grammar str

The compiled grammar to use to create the logits processor.

required
tensor_library_name str

The name of the tensor library used by the model

required
Source code in outlines/backends/xgrammar.py
def __init__(self, compiled_grammar: str, tensor_library_name: str,):
    """
    Parameters
    ----------
    compiled_grammar: str
        The compiled grammar to use to create the logits processor.
    tensor_library_name: str
        The name of the tensor library used by the model

    """
    import xgrammar as xgr

    self.xgr = xgr
    self.compiled_grammar = compiled_grammar
    self.tensor_library_name = tensor_library_name
    # Defer matcher/bitmask allocation to the first `process_logits` call.
    self.is_first_token = True
    super().__init__(tensor_library_name)
process_logits(input_ids, logits)

Use the XGrammar matchers to bias the logits.

Source code in outlines/backends/xgrammar.py
def process_logits(
    self, input_ids: TensorType, logits: TensorType
) -> TensorType:
    """Use the XGrammar matchers to bias the logits."""
    batch_size = self.tensor_adapter.shape(input_ids)[0]
    vocab_size = self.tensor_adapter.shape(logits)[1]

    if self.is_first_token:
        self._setup(batch_size, vocab_size)
        self.is_first_token = False
        return self._bias_logits(input_ids, logits)

    # Advance each live matcher with the token sampled last step.
    for i in range(batch_size):
        if not self._matchers[i].is_terminated(): # pragma: no cover
            latest = self.tensor_adapter.to_scalar(
                input_ids[i][-1] # type: ignore
            )
            assert self._matchers[i].accept_token(latest)

    return self._bias_logits(input_ids, logits)
reset()

Ensure self._setup is called again for the next generation.

Source code in outlines/backends/xgrammar.py
def reset(self):
    """Flag the processor so `self._setup` runs again next generation."""
    # Re-arm lazy setup; matchers will be rebuilt on the next call.
    self.is_first_token = True

caching

Caching and memoization of function calls.

cache(expire=None, typed=False, ignore=())

Caching decorator for memoizing function calls.

The cache key is created from the decorated function's full name and the arguments it is called with.

This is based on diskcache's memoize.

Parameters:

Name Type Description Default
expire Optional[float]

Seconds until arguments expire.

None
typed

Cache different types separately.

False
ignore

Positional or keyword arguments to ignore.

()

Returns:

Type Description
A decorator function that can be applied to other functions.
Source code in outlines/caching.py
def cache(expire: Optional[float] = None, typed=False, ignore=()):
    """Caching decorator for memoizing function calls.

    The cache key is created from the decorated function's full name and
    the arguments it is called with.

    This is based on `diskcache`'s `memoize`.

    Parameters
    ----------
    expire
        Seconds until arguments expire.
    typed
        Cache different types separately.
    ignore
        Positional or keyword arguments to ignore.

    Returns
    -------
        A decorator function that can be applied to other functions.
    """

    def decorator(cached_function: Callable):
        memory = get_cache()

        # Prefix every key with the function's full name so different
        # functions never collide in the shared cache.
        base = (full_name(cached_function),)

        if asyncio.iscoroutinefunction(cached_function):  # pragma: no cover

            # `functools.wraps` preserves the wrapped function's metadata
            # (`__name__`, `__doc__`, annotations) — the original wrapper
            # did not, which broke introspection of decorated functions.
            @functools.wraps(cached_function)
            async def wrapper(*args, **kwargs):
                if not _caching_enabled:
                    return await cached_function(*args, **kwargs)

                cache_key = wrapper.__cache_key__(*args, **kwargs)
                result = wrapper.__memory__.get(cache_key, default=ENOVAL, retry=True)

                if result is ENOVAL:
                    result = await cached_function(*args, **kwargs)
                    wrapper.__memory__.set(cache_key, result, expire, retry=True)

                return result

        else:

            @functools.wraps(cached_function)
            def wrapper(*args, **kwargs):
                if not _caching_enabled:
                    return cached_function(*args, **kwargs)

                cache_key = wrapper.__cache_key__(*args, **kwargs)
                result = wrapper.__memory__.get(cache_key, default=ENOVAL, retry=True)

                if result is ENOVAL:
                    result = cached_function(*args, **kwargs)
                    wrapper.__memory__.set(cache_key, result, expire, retry=True)

                return result

        def __cache_key__(*args, **kwargs):
            """Make key for cache given function arguments."""
            return args_to_key(base, args, kwargs, typed, ignore)

        # `functools.wraps` already set `__wrapped__`; expose the key
        # builder and the cache object for tests and manual invalidation.
        wrapper.__cache_key__ = __cache_key__  # type: ignore
        wrapper.__memory__ = memory  # type: ignore

        return wrapper

    return decorator

clear_cache()

Erase the cache completely.

Source code in outlines/caching.py
def clear_cache():
    """Erase the cache completely."""
    # Drop every entry from the on-disk cache returned by `get_cache`.
    get_cache().clear()

disable_cache()

Disable the cache for this session.

Generative models output different results each time they are called when sampling. This can be a desirable property for some workflows, in which case one can call outlines.caching.disable_cache to disable the cache for the session.

This function does not delete the cache, call outlines.caching.clear_cache instead. It also does not overwrite the cache with the values returned during the session.

Example

outlines.caching.disable_cache should be called right after importing outlines:

import outlines.caching as cache cache.disable_cache()

Source code in outlines/caching.py
def disable_cache():
    """Disable the cache for this session.

    Generative models output different results each time they are called when
    sampling. This can be a desirable property for some workflows, in which
    case one can call `outlines.caching.disable_cache` to disable the cache
    for the session.

    This function does not delete the cache, call
    `outlines.caching.clear_cache` instead. It also does not overwrite the
    cache with the values returned during the session.

    Example
    -------

    `disable_cache` should be called right after importing outlines:

    >>> import outlines.caching as cache
    >>> cache.disable_cache()

    """
    # Flip the module-level switch checked by the `cache` decorator.
    global _caching_enabled
    _caching_enabled = False

get_cache() cached

Get the context object that contains previously-computed return values.

The cache is used to avoid unnecessary computations and API calls, which can be long and expensive for large models.

The cache directory defaults to HOMEDIR/.cache/outlines, but this choice can be overridden by the user by setting the value of the OUTLINES_CACHE_DIR environment variable.

Source code in outlines/caching.py
@functools.lru_cache(1)
def get_cache():
    """Get the context object that contains previously-computed return values.

    The cache is used to avoid unnecessary computations and API calls, which
    can be long and expensive for large models.

    The cache directory defaults to `HOMEDIR/.cache/outlines`, but this choice
    can be overridden by the user by setting the value of the
    `OUTLINES_CACHE_DIR` environment variable.

    """
    from outlines._version import __version__ as outlines_version  # type: ignore

    explicit_dir = os.environ.get("OUTLINES_CACHE_DIR")
    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
    home_dir = os.path.normpath(os.path.expanduser("~"))

    if explicit_dir:
        # OUTLINES_CACHE_DIR takes precedence
        cache_dir = explicit_dir
    elif xdg_cache_home:  # pragma: no cover
        # NOTE(review): XDG_CACHE_HOME is itself the cache root per the XDG
        # spec, so the extra ".cache" level looks redundant — confirm before
        # changing, since changing it would relocate existing caches.
        cache_dir = os.path.join(xdg_cache_home, ".cache", "outlines")
    elif home_dir != "/": # pragma: no cover
        cache_dir = os.path.join(home_dir, ".cache", "outlines")
    else:  # pragma: no cover
        # home_dir may be / inside a docker container without existing user
        cache_dir = os.path.join(tempfile.gettempdir(), ".cache", "outlines")

    memory = Cache(
        cache_dir,
        eviction_policy="none",
        cull_limit=0,
        disk=CloudpickleDisk,
    )

    # ensure if version upgrade occurs, old cache is pruned
    if outlines_version != memory.get("__version__"):
        memory.clear()
    memory["__version__"] = outlines_version

    return memory

generator

Encapsulate a model and an output type into a reusable object.

AsyncBlackBoxGenerator

Asynchronous generator for which we don't control constrained generation.

The output type provided is not compiled into a logits processor, but is instead directly passed on to the model.

Source code in outlines/generator.py
class AsyncBlackBoxGenerator:
    """Asynchronous generator for which we don't control constrained
    generation.

    The output type provided is not compiled into a logits processor, but is
    instead directly passed on to the model.

    """
    output_type: Optional[Any]

    def __init__(self, model: AsyncBlackBoxModel, output_type: Optional[Any]):
        """
        Parameters
        ----------
        model
            An instance of an Outlines model.
        output_type
            The output type that will be used to constrain the generation.

        """
        self.output_type = output_type
        self.model = model

    async def __call__(self, prompt: Any, **inference_kwargs) -> Any:
        """Generate a single response from the model.

        Parameters
        ----------
        prompt
            The prompt based on which the model generates its response.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        return await self.model.generate(
            prompt, self.output_type, **inference_kwargs
        )

    async def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
        """Generate one response per prompt.

        Parameters
        ----------
        prompts
            The prompts for which responses will be generated.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        List[Any]
            The responses generated by the model, one per prompt.

        """
        return await self.model.generate_batch(
            prompts, self.output_type, **inference_kwargs
        )

    async def stream(self, prompt: Any, **inference_kwargs) -> AsyncIterator[Any]:
        """Stream the model's response chunk by chunk.

        Parameters
        ----------
        prompt
            The prompt based on which the model generates its response.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        AsyncIterator[Any]
            An async iterator over the chunks of the response.

        """
        async for piece in self.model.generate_stream(  # pragma: no cover
            prompt, self.output_type, **inference_kwargs
        ):
            yield piece

__call__(prompt, **inference_kwargs) async

Generate a response from the model.

Parameters:

Name Type Description Default
prompt Any

The prompt to use to generate a response.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/generator.py
async def __call__(self, prompt: Any, **inference_kwargs) -> Any:
    """Generate a single response from the model.

    Parameters
    ----------
    prompt
        The prompt based on which the model generates its response.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Any
        The response generated by the model.

    """
    return await self.model.generate(
        prompt, self.output_type, **inference_kwargs
    )

__init__(model, output_type)

Parameters:

Name Type Description Default
model AsyncBlackBoxModel

An instance of an Outlines model.

required
output_type Optional[Any]

The output type that will be used to constrain the generation.

required
Source code in outlines/generator.py
def __init__(self, model: AsyncBlackBoxModel, output_type: Optional[Any]):
    """
    Parameters
    ----------
    model
        An instance of an Outlines model.
    output_type
        The output type that will be used to constrain the generation.

    """
    self.output_type = output_type
    self.model = model

batch(prompts, **inference_kwargs) async

Generate a batch of responses from the model.

Parameters:

Name Type Description Default
prompts List[Any]

The list of prompts to use to generate a batch of responses.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
List[Any]

The list of responses generated by the model.

Source code in outlines/generator.py
async def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
    """Generate one response per prompt.

    Parameters
    ----------
    prompts
        The prompts for which responses will be generated.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    List[Any]
        The responses generated by the model, one per prompt.

    """
    return await self.model.generate_batch(
        prompts, self.output_type, **inference_kwargs
    )

stream(prompt, **inference_kwargs) async

Generate a stream of responses from the model.

Parameters:

Name Type Description Default
prompt Any

The prompt to use to generate a response.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/generator.py
async def stream(self, prompt: Any, **inference_kwargs) -> AsyncIterator[Any]:
    """Stream the model's response chunk by chunk.

    Parameters
    ----------
    prompt
        The prompt based on which the model generates its response.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    AsyncIterator[Any]
        An async iterator over the chunks of the response.

    """
    async for piece in self.model.generate_stream(  # pragma: no cover
        prompt, self.output_type, **inference_kwargs
    ):
        yield piece

BlackBoxGenerator

Synchronous generator for which we don't control constrained generation.

The output type provided is not compiled into a logits processor, but is instead directly passed on to the model.

Source code in outlines/generator.py
class BlackBoxGenerator:
    """Synchronous generator for which we don't control constrained
    generation.

    The output type provided is not compiled into a logits processor, but is
    instead directly passed on to the model.

    """
    output_type: Optional[Any]

    def __init__(self, model: BlackBoxModel, output_type: Optional[Any]):
        """
        Parameters
        ----------
        model
            An instance of an Outlines model.
        output_type
            The output type that will be used to constrain the generation.

        """
        self.output_type = output_type
        self.model = model

    def __call__(self, prompt: Any, **inference_kwargs) -> Any:
        """Generate a single response from the model.

        Parameters
        ----------
        prompt
            The prompt based on which the model generates its response.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        return self.model.generate(
            prompt, self.output_type, **inference_kwargs
        )

    def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
        """Generate one response per prompt.

        Parameters
        ----------
        prompts
            The prompts for which responses will be generated.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        List[Any]
            The responses generated by the model, one per prompt.

        """
        return self.model.generate_batch(
            prompts, self.output_type, **inference_kwargs
        )

    def stream(self, prompt: Any, **inference_kwargs) -> Iterator[Any]:
        """Stream the model's response chunk by chunk.

        Parameters
        ----------
        prompt
            The prompt based on which the model generates its response.
        **inference_kwargs
            Additional keyword arguments forwarded to the model.

        Returns
        -------
        Iterator[Any]
            An iterator over the chunks of the response.

        """
        return self.model.generate_stream(
            prompt, self.output_type, **inference_kwargs
        )

__call__(prompt, **inference_kwargs)

Generate a response from the model.

Parameters:

Name Type Description Default
prompt Any

The prompt to use to generate a response.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/generator.py
def __call__(self, prompt: Any, **inference_kwargs) -> Any:
    """Generate a single response from the model.

    Parameters
    ----------
    prompt
        The prompt based on which the model generates its response.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Any
        The response generated by the model.

    """
    return self.model.generate(
        prompt, self.output_type, **inference_kwargs
    )

__init__(model, output_type)

Parameters:

Name Type Description Default
model BlackBoxModel

An instance of an Outlines model.

required
output_type Optional[Any]

The output type that will be used to constrain the generation.

required
Source code in outlines/generator.py
def __init__(self, model: BlackBoxModel, output_type: Optional[Any]):
    """
    Parameters
    ----------
    model
        An instance of an Outlines model.
    output_type
        The output type that will be used to constrain the generation.

    """
    self.output_type = output_type
    self.model = model

batch(prompts, **inference_kwargs)

Generate a batch of responses from the model.

Parameters:

Name Type Description Default
prompts List[Any]

The list of prompts to use to generate a batch of responses.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
List[Any]

The list of responses generated by the model.

Source code in outlines/generator.py
def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
    """Generate one response per prompt.

    Parameters
    ----------
    prompts
        The prompts for which responses will be generated.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    List[Any]
        The responses generated by the model, one per prompt.

    """
    return self.model.generate_batch(
        prompts, self.output_type, **inference_kwargs
    )

stream(prompt, **inference_kwargs)

Generate a stream of responses from the model.

Parameters:

Name Type Description Default
prompt Any

The prompt to use to generate a response.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/generator.py
def stream(self, prompt: Any, **inference_kwargs) -> Iterator[Any]:
    """Stream the model's response chunk by chunk.

    Parameters
    ----------
    prompt
        The prompt based on which the model generates its response.
    **inference_kwargs
        Additional keyword arguments forwarded to the model.

    Returns
    -------
    Iterator[Any]
        An iterator over the chunks of the response.

    """
    return self.model.generate_stream(
        prompt, self.output_type, **inference_kwargs
    )

SteerableGenerator

Represents a generator for which we control constrained generation.

The generator is responsible for building and storing the logits processor (which can be quite expensive to build), and then passing it to the model when the generator is called.

The argument defining constrained generation can be of 2 types associated to different methods to create an instance of the generator: - output_type (through __init__): an output type as defined in the outlines.types module - processor (through from_processor): an already built logits processor as defined in the outlines.processors module

The 2 parameters are mutually exclusive.

Source code in outlines/generator.py
class SteerableGenerator:
    """Generator for models whose constrained generation we control.

    Building a logits processor can be quite expensive, so the generator
    constructs it once, keeps it, and hands it to the model on every
    invocation.

    An instance can be created in two mutually exclusive ways, depending on
    how the constraint is expressed:
    - `output_type` (through `__init__`): an output type as defined in the
      `outlines.types` module
    - `processor` (through `from_processor`): an already built logits processor
       as defined in the `outlines.processors` module

    """
    logits_processor: Optional[LogitsProcessorType]

    def __init__(
        self,
        model: SteerableModel,
        output_type: Optional[Any],
        backend_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        model
            An instance of an Outlines model.
        output_type
            The output type expressed as a Python type
        backend_name
            The name of the backend to use to create the logits processor.

        """
        self.model = model
        if output_type is None:
            # Unconstrained generation: nothing to build.
            self.logits_processor = None
            return

        term = python_types_to_terms(output_type)
        # Dispatch on the kind of term: grammar, JSON schema, or regex.
        if isinstance(term, CFG):
            self.logits_processor = get_cfg_logits_processor(
                backend_name, model, term.definition,
            )
        elif isinstance(term, JsonSchema):
            self.logits_processor = get_json_schema_logits_processor(
                backend_name, model, term.schema,
            )
        else:
            self.logits_processor = get_regex_logits_processor(
                backend_name, model, to_regex(term),
            )

    @classmethod
    def from_processor(
        cls, model: SteerableModel, processor: LogitsProcessorType
    ):
        """Create a generator from a logits processor.

        Parameters
        ----------
        model
            An instance of an Outlines model.
        processor
            An instance of a logits processor.

        """
        # Bypass __init__: the logits processor is already built.
        generator = cls.__new__(cls)
        generator.model = model
        generator.logits_processor = processor
        return generator

    def _reset_processor(self) -> None:
        """Reset the stored logits processor, if any, before a new run."""
        if self.logits_processor is not None:
            self.logits_processor.reset()

    def __call__(self, prompt: Any, **inference_kwargs) -> Any:
        """Generate a response from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        self._reset_processor()
        return self.model.generate(
            prompt, self.logits_processor, **inference_kwargs
        )

    def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
        """Generate a batch of responses from the model.

        Parameters
        ----------
        prompts
            The list of prompts to use to generate a batch of responses.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        self._reset_processor()
        return self.model.generate_batch(
            prompts, self.logits_processor, **inference_kwargs
        )

    def stream(self, prompt: Any, **inference_kwargs) -> Iterator[Any]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        self._reset_processor()
        return self.model.generate_stream(
            prompt, self.logits_processor, **inference_kwargs
        )

__call__(prompt, **inference_kwargs)

Generate a response from the model.

Parameters:

Name Type Description Default
prompt Any

The prompt to use to generate a response.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/generator.py
def __call__(self, prompt: Any, **inference_kwargs) -> Any:
    """Generate a response from the model.

    Parameters
    ----------
    prompt
        The prompt to use to generate a response.
    **inference_kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    Any
        The response generated by the model.

    """
    if self.logits_processor is not None:
        self.logits_processor.reset()
    return self.model.generate(
        prompt, self.logits_processor, **inference_kwargs
    )

__init__(model, output_type, backend_name=None)

Parameters:

Name Type Description Default
model SteerableModel

An instance of an Outlines model.

required
output_type Optional[Any]

The output type expressed as a Python type

required
backend_name Optional[str]

The name of the backend to use to create the logits processor.

None
Source code in outlines/generator.py
def __init__(
    self,
    model: SteerableModel,
    output_type: Optional[Any],
    backend_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    model
        An instance of an Outlines model.
    output_type
        The output type expressed as a Python type
    backend_name
        The name of the backend to use to create the logits processor.

    """
    self.model = model
    if output_type is None:
        # Unconstrained generation: nothing to build.
        self.logits_processor = None
        return

    term = python_types_to_terms(output_type)
    # Dispatch on the kind of term: grammar, JSON schema, or regex.
    if isinstance(term, CFG):
        self.logits_processor = get_cfg_logits_processor(
            backend_name, model, term.definition,
        )
    elif isinstance(term, JsonSchema):
        self.logits_processor = get_json_schema_logits_processor(
            backend_name, model, term.schema,
        )
    else:
        self.logits_processor = get_regex_logits_processor(
            backend_name, model, to_regex(term),
        )

batch(prompts, **inference_kwargs)

Generate a batch of responses from the model.

Parameters:

Name Type Description Default
prompts List[Any]

The list of prompts to use to generate a batch of responses.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
List[Any]

The list of responses generated by the model.

Source code in outlines/generator.py
def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
    """Generate a batch of responses from the model.

    The stored logits processor, when present, is reset so the batch
    starts from a clean constraint state.

    Parameters
    ----------
    prompts
        The list of prompts to use to generate a batch of responses.
    **inference_kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    List[Any]
        The list of responses generated by the model.

    """
    processor = self.logits_processor
    if processor is not None:
        processor.reset()
    return self.model.generate_batch(prompts, processor, **inference_kwargs)

from_processor(model, processor) classmethod

Create a generator from a logits processor.

Parameters:

Name Type Description Default
model SteerableModel

An instance of an Outlines model.

required
processor LogitsProcessorType

An instance of a logits processor.

required
Source code in outlines/generator.py
@classmethod
def from_processor(
    cls, model: SteerableModel, processor: LogitsProcessorType
):
    """Create a generator from a logits processor.

    Bypasses `__init__` because the logits processor is already built.

    Parameters
    ----------
    model
        An instance of an Outlines model.
    processor
        An instance of a logits processor.

    """
    generator = cls.__new__(cls)
    generator.model = model
    generator.logits_processor = processor
    return generator

stream(prompt, **inference_kwargs)

Generate a stream of responses from the model.

Parameters:

Name Type Description Default
prompt Any

The prompt to use to generate a response.

required
**inference_kwargs

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
Any

The response generated by the model.

Source code in outlines/generator.py
def stream(self, prompt: Any, **inference_kwargs) -> Iterator[Any]:
    """Generate a stream of responses from the model.

    The stored logits processor, when present, is reset so the run starts
    from a clean constraint state.

    Parameters
    ----------
    prompt
        The prompt to use to generate a response.
    **inference_kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    Any
        The response generated by the model.

    """
    processor = self.logits_processor
    if processor is not None:
        processor.reset()
    return self.model.generate_stream(prompt, processor, **inference_kwargs)

Generator(model, output_type=None, backend=None, *, processor=None)

Create a generator for the given model and output parameters.

The 2 parameters output_type and processor are mutually exclusive. The parameter processor is only supported for SteerableModel instances (typically local models) and is intended to be used only by advanced users.

Parameters:

Name Type Description Default
model Union[Model, AsyncModel]

An instance of an Outlines model.

required
output_type Optional[Any]

The output type expressed as a Python type or a type defined in the outlines.types.dsl module.

None
backend Optional[str]

The name of the backend to use to create the logits processor. Only used for steerable models if there is an output type and processor is not provided.

None
processor Optional[LogitsProcessorType]

An instance of a logits processor.

None

Returns:

Type Description
Union[SteerableGenerator, BlackBoxGenerator, AsyncBlackBoxGenerator]

A generator instance.

Source code in outlines/generator.py
def Generator(
    model: Union[Model, AsyncModel],
    output_type: Optional[Any] = None,
    backend: Optional[str] = None,
    *,
    processor: Optional[LogitsProcessorType] = None,
) -> Union[SteerableGenerator, BlackBoxGenerator, AsyncBlackBoxGenerator]:
    """Create a generator for the given model and output parameters.

    The parameters `output_type` and `processor` are mutually exclusive.
    `processor` is only supported for SteerableModel instances (typically
    local models) and is intended to be used only by advanced users.

    Parameters
    ----------
    model
        An instance of an Outlines model.
    output_type
        The output type expressed as a Python type or a type defined in the
        outlines.types.dsl module.
    backend
        The name of the backend to use to create the logits processor. Only
        used for steerable models if there is an output type and `processor` is
        not provided.
    processor
        An instance of a logits processor.

    Returns
    -------
    Union[SteerableGenerator, BlackBoxGenerator, AsyncBlackBoxGenerator]
        A generator instance.

    """
    # The two ways of defining the output constraint cannot be combined.
    if output_type is not None and processor is not None:
        raise ValueError(
            "At most one of output_type or processor can be provided"
        )

    if isinstance(model, SteerableModel): # type: ignore
        if processor is not None:
            return SteerableGenerator.from_processor(model, processor) # type: ignore
        return SteerableGenerator(model, output_type, backend) # type: ignore

    # Black-box (remote) models cannot take a raw logits processor.
    if processor is not None:
        raise NotImplementedError(
            "This model does not support logits processors"
        )
    if isinstance(model, AsyncBlackBoxModel): # type: ignore
        return AsyncBlackBoxGenerator(model, output_type) # type: ignore
    if isinstance(model, BlackBoxModel): # type: ignore
        return BlackBoxGenerator(model, output_type) # type: ignore
    raise ValueError(
        "The model argument must be an instance of "
        "SteerableModel, BlackBoxModel or AsyncBlackBoxModel"
    )

grammars

A few common Lark grammars.

read_grammar(grammar_file_name, base_grammar_path=GRAMMAR_PATH)

Read grammar file from default grammar path.

Parameters:

Name Type Description Default
grammar_file_name str

The name of the grammar file to read.

required
base_grammar_path Path

The path to the directory containing the grammar file.

GRAMMAR_PATH

Returns:

Type Description
str

The contents of the grammar file.

Source code in outlines/grammars.py
def read_grammar(
    grammar_file_name: str,
    base_grammar_path: Path = GRAMMAR_PATH,
) -> str:
    """Read grammar file from default grammar path.

    Parameters
    ----------
    grammar_file_name
        The name of the grammar file to read.
    base_grammar_path
        The path to the directory containing the grammar file.

    Returns
    -------
    str
        The contents of the grammar file.

    """
    # Decode explicitly as UTF-8: the bare open() used previously relied on
    # the platform's locale default encoding, which can misread the bundled
    # grammar files on non-UTF-8 systems.
    full_path = base_grammar_path / grammar_file_name
    return full_path.read_text(encoding="utf-8")

inputs

Contain classes used to define the inputs of a model.

Audio dataclass

Contains an audio that can be passed to a multimodal model.

Provide one or several instances of this class along with a text prompt in a list as the model_input argument to a model that supports audio processing.

Parameters:

Name Type Description Default
audio Any

The audio to use in the text generation.

required
Source code in outlines/inputs.py
@dataclass
class Audio:
    """Audio asset that can be passed to a multimodal model.

    Pass one or several instances of this class, together with a text
    prompt, in a list given as the `model_input` argument to a model that
    supports audio processing.

    Parameters
    ----------
    audio
        The audio to use in the text generation.

    """
    audio: Any

Chat dataclass

Contains the input for a chat model.

Provide an instance of this class as the model_input argument to a model that supports chat.

Each message contained in the messages list must be a dict with 'role' and 'content' keys. The role can be 'user', 'assistant', or 'system'. The content supports either: - a text string, - a list containing text and assets (e.g., ["Describe...", Image(...)]), - only for HuggingFace transformers models, a list of dict items with explicit types (e.g., [{"type": "text", "text": "Describe..."}, {"type": "image", "image": Image(...)}])

Examples:

# Initialize the chat with a system message.
chat_prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
])

# Add a user message with an image and call the model (not shown here).
chat_prompt.add_user_message(["Describe the image below", Image(image)])

# Add as an assistant message the response from the model.
chat_prompt.add_assistant_message("There is a black cat sitting on a couch.")

Parameters:

Name Type Description Default
messages List[Dict[str, Any]]

The list of messages that will be provided to the model.

None
Source code in outlines/inputs.py
@dataclass
class Chat:
    """Contains the input for a chat model.

    Provide an instance of this class as the `model_input` argument to a model
    that supports chat.

    Each message contained in the messages list must be a dict with 'role' and
    'content' keys. The role can be 'user', 'assistant', or 'system'. The content
    supports either:
    - a text string,
    - a list containing text and assets (e.g., ["Describe...", Image(...)]),
    - only for HuggingFace transformers models, a list of dict items with explicit types (e.g.,
      [{"type": "text", "text": "Describe..."}, {"type": "image", "image": Image(...)}])

    Examples
    --------
    ```python
    # Initialize the chat with a system message.
    chat_prompt = Chat([
        {"role": "system", "content": "You are a helpful assistant."},
    ])

    # Add a user message with an image and call the model (not shown here).
    chat_prompt.add_user_message(["Describe the image below", Image(image)])

    # Add as an assistant message the response from the model.
    chat_prompt.add_assistant_message("There is a black cat sitting on a couch.")
    ```

    Parameters
    ----------
    messages
        The list of messages that will be provided to the model.

    """
    messages: List[Dict[str, Any]] = None # type: ignore

    def __post_init__(self):
        if self.messages is None:
            self.messages = []

    def append(self, message: Dict[str, Any]):
        """Add a message to the chat.

        Parameters
        ----------
        message
            The message to add to the chat.

        """
        self.messages.append(message)

    def extend(self, messages: List[Dict[str, Any]]):
        """Add a list of messages to the chat.

        Parameters
        ----------
        messages
            The list of messages to add to the chat.

        """
        self.messages.extend(messages)

    def pop(self) -> Dict[str, Any]:
        """Remove the last message from the chat.

        Returns
        -------
        message
            The removed message.

        """
        return self.messages.pop()

    def add_system_message(self, content: str | List[Any]):
        """Add a system message to the chat.

        Parameters
        ----------
        content
            The content of the system message.

        """
        self.messages.append({"role": "system", "content": content})

    def add_user_message(self, content: str | List[Any]):
        """Add a user message to the chat.

        Parameters
        ----------
        content
            The content of the user message.

        """
        self.messages.append({"role": "user", "content": content})

    def add_assistant_message(self, content: str | List[Any]):
        """Add an assistant message to the chat.

        Parameters
        ----------
        content
            The content of the assistant message.

        """
        self.messages.append({"role": "assistant", "content": content})

    def __str__(self):
        return "\n".join(str(message) for message in self.messages)

    def __repr__(self):
        return f"Chat(messages={self.messages})"

add_assistant_message(content)

Add an assistant message to the chat.

Parameters:

Name Type Description Default
content str | List[Any]

The content of the assistant message.

required
Source code in outlines/inputs.py
def add_assistant_message(self, content: str | List[Any]):
    """Add an assistant message to the chat.

    Parameters
    ----------
    content
        The content of the assistant message.

    """
    self.messages.append({"role": "assistant", "content": content})

add_system_message(content)

Add a system message to the chat.

Parameters:

Name Type Description Default
content str | List[Any]

The content of the system message.

required
Source code in outlines/inputs.py
def add_system_message(self, content: str | List[Any]):
    """Add a system message to the chat.

    Parameters
    ----------
    content
        The content of the system message.

    """
    self.messages.append({"role": "system", "content": content})

add_user_message(content)

Add a user message to the chat.

Parameters:

Name Type Description Default
content str | List[Any]

The content of the user message.

required
Source code in outlines/inputs.py
def add_user_message(self, content: str | List[Any]):
    """Add a user message to the chat.

    Parameters
    ----------
    content
        The content of the user message.

    """
    self.messages.append({"role": "user", "content": content})

append(message)

Add a message to the chat.

Parameters:

Name Type Description Default
message Dict[str, Any]

The message to add to the chat.

required
Source code in outlines/inputs.py
def append(self, message: Dict[str, Any]):
    """Append one message to the conversation history.

    Parameters
    ----------
    message
        The message to add to the chat.

    """
    self.messages += [message]

extend(messages)

Add a list of messages to the chat.

Parameters:

Name Type Description Default
messages List[Dict[str, Any]]

The list of messages to add to the chat.

required
Source code in outlines/inputs.py
def extend(self, messages: List[Dict[str, Any]]):
    """Append several messages to the conversation history at once.

    Parameters
    ----------
    messages
        The list of messages to add to the chat.

    """
    for message in messages:
        self.messages.append(message)

pop()

Remove the last message from the chat.

Returns:

Type Description
message

The removed message.

Source code in outlines/inputs.py
def pop(self) -> Dict[str, Any]:
    """Remove and return the most recent message in the chat.

    Returns
    -------
    message
        The removed message.

    """
    last_message = self.messages.pop()
    return last_message

Image dataclass

Contains an image that can be passed to a multimodal model.

Provide one or several instances of this class along with a text prompt in a list as the model_input argument to a model that supports vision.

Parameters:

Name Type Description Default
image Image

The image to use in the text generation.

required
Source code in outlines/inputs.py
@dataclass
class Image:
    """Contains an image that can be passed to a multimodal model.

    Provide one or several instances of this class along with a text prompt
    in a list as the `model_input` argument to a model that supports vision.

    On construction, the image is serialized into a base64 string
    (`image_str`) and its MIME type is recorded (`image_format`).

    Parameters
    ----------
    image
        The image to use in the text generation.

    """
    image: PILImage.Image

    def __post_init__(self):
        # Without a format we cannot serialize the image for the model.
        if not self.image.format:
            raise TypeError(
                "Could not read the format of the image passed to the model."
            )

        buffer = BytesIO()
        self.image.save(buffer, format=self.image.format)
        raw_bytes = buffer.getvalue()
        self.image_str = base64.b64encode(raw_bytes).decode("utf-8")
        self.image_format = f"image/{self.image.format.lower()}"

Video dataclass

Contains a video that can be passed to a multimodal model.

Provide one or several instances of this class along with a text prompt in a list as the model_input argument to a model that supports video processing.

Parameters:

Name Type Description Default
video Any

The video to use in the text generation.

required
Source code in outlines/inputs.py
@dataclass
class Video:
    """Video asset that can be passed to a multimodal model.

    Pass one or several instances of this class, together with a text
    prompt, in a list given as the `model_input` argument to a model that
    supports video processing.

    Parameters
    ----------
    video
        The video to use in the text generation.

    """
    video: Any

models

Module that contains all the models integrated in outlines.

We group the models in submodules by provider instead of theme (completion, chat completion, diffusers, etc.) and use routing functions everywhere else in the codebase.

Anthropic

Bases: Model

Thin wrapper around the anthropic.Anthropic client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the anthropic.Anthropic client.

Source code in outlines/models/anthropic.py
class Anthropic(Model):
    """Thin wrapper around the `anthropic.Anthropic` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `anthropic.Anthropic` client.

    """
    def __init__(
        self, client: "AnthropicClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client
            An `anthropic.Anthropic` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = AnthropicTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using Anthropic.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            As structured generation is not supported by Anthropic, the value
            of this argument must be `None`. Otherwise, an error will be
            raised at runtime.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The response generated by the model.

        """
        messages = self.type_adapter.format_input(model_input)

        if output_type is not None:
            raise NotImplementedError(
                f"The type {output_type} is not available with Anthropic."
            )

        # Fall back to the model name given at initialization when the
        # caller did not pass one explicitly.
        if (
            "model" not in inference_kwargs
            and self.model_name is not None
        ):
            inference_kwargs["model"] = self.model_name

        completion = self.client.messages.create(
            **messages,
            **inference_kwargs,
        )
        return completion.content[0].text

    def generate_batch(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ):
        """Generate a batch of responses using Anthropic.

        Batch generation is not implemented for the Anthropic model, so this
        method always raises.

        Parameters
        ----------
        model_input
            The prompts based on which the model would generate responses;
            unused.
        output_type
            The expected output type; unused.
        **inference_kwargs
            Additional keyword arguments; unused.

        Raises
        ------
        NotImplementedError
            Always raised, as batch generation is not supported.

        """
        raise NotImplementedError(
            "Anthropic does not support batch generation."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using Anthropic.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            As structured generation is not supported by Anthropic, the value
            of this argument must be `None`. Otherwise, an error will be
            raised at runtime.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        messages = self.type_adapter.format_input(model_input)

        if output_type is not None:
            raise NotImplementedError(
                f"The type {output_type} is not available with Anthropic."
            )

        if (
            "model" not in inference_kwargs
            and self.model_name is not None
        ):
            inference_kwargs["model"] = self.model_name

        stream = self.client.messages.create(
            **messages,
            stream=True,
            **inference_kwargs,
        )

        # Only text deltas carry generated text; other stream events
        # (message start/stop, block start, etc.) are skipped.
        for chunk in stream:
            if (
                chunk.type == "content_block_delta"
                and chunk.delta.type == "text_delta"
            ):
                yield chunk.delta.text

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Anthropic

An anthropic.Anthropic client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/anthropic.py
def __init__(
    self, client: "AnthropicClient", model_name: Optional[str] = None
):
    """
    Parameters
    ----------
    client
        An `anthropic.Anthropic` client.
    model_name
        The name of the model to use.

    """
    # The type adapter translates Outlines inputs into Anthropic arguments.
    self.type_adapter = AnthropicTypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **inference_kwargs)

Generate text using Anthropic.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

As structured generation is not supported by Anthropic, the value of this argument must be None. Otherwise, an error will be raised at runtime.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The response generated by the model.

Source code in outlines/models/anthropic.py
def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using Anthropic.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        As structured generation is not supported by Anthropic, the value
        of this argument must be `None`. Otherwise, an error will be
        raised at runtime.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The response generated by the model.

    """
    messages = self.type_adapter.format_input(model_input)

    if output_type is not None:
        raise NotImplementedError(
            f"The type {output_type} is not available with Anthropic."
        )

    # Fall back to the model name given at initialization when the caller
    # did not pass one explicitly.
    if self.model_name is not None:
        inference_kwargs.setdefault("model", self.model_name)

    response = self.client.messages.create(**messages, **inference_kwargs)
    return response.content[0].text

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using Anthropic.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

As structured generation is not supported by Anthropic, the value of this argument must be None. Otherwise, an error will be raised at runtime.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Iterator[str]

An iterator that yields the text generated by the model.

Source code in outlines/models/anthropic.py
def generate_stream(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using Anthropic.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        As structured generation is not supported by Anthropic, the value
        of this argument must be `None`. Otherwise, an error will be
        raised at runtime.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    messages = self.type_adapter.format_input(model_input)

    if output_type is not None:
        raise NotImplementedError(
            f"The type {output_type} is not available with Anthropic."
        )

    # Fall back to the model name given at initialization when the caller
    # did not pass one explicitly.
    if self.model_name is not None:
        inference_kwargs.setdefault("model", self.model_name)

    event_stream = self.client.messages.create(
        stream=True,
        **messages,
        **inference_kwargs,
    )

    # Only text deltas carry generated text; other stream events are
    # skipped.
    for event in event_stream:
        if event.type != "content_block_delta":
            continue
        if event.delta.type == "text_delta":
            yield event.delta.text

AsyncLMStudio

Bases: AsyncModel

Thin wrapper around a lmstudio.AsyncClient client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the LMStudio async client.

Source code in outlines/models/lmstudio.py
class AsyncLMStudio(AsyncModel):
    """Thin wrapper around a `lmstudio.AsyncClient` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the LMStudio async client.

    """

    def __init__(
        self, client: "AsyncClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client
            A LMStudio AsyncClient instance.
        model_name
            The name of the model to use. If not provided, uses the default
            loaded model in LMStudio.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = LMStudioTypeAdapter()
        # Whether `client.__aenter__` has been awaited; set lazily on the
        # first generation call and reset by `close`.
        self._context_entered = False

    async def _enter_context(self) -> None:
        # Enter the client's async context exactly once, on first use.
        if not self._context_entered:
            await self.client.__aenter__()
            self._context_entered = True

    async def _resolve_model(self, kwargs: dict) -> Any:
        # Pop the model name (an explicit kwarg wins over the instance
        # default) and fetch the corresponding model handle from the client.
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name
        model_key = kwargs.pop("model", None)
        if model_key:
            return await self.client.llm.model(model_key)
        return await self.client.llm.model()

    async def close(self) -> None:
        """Close the async client and release resources."""
        if not self._context_entered:
            return
        await self.client.__aexit__(None, None, None)
        self._context_entered = False

    async def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using LMStudio asynchronously.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        str
            The text generated by the model.

        """
        await self._enter_context()
        model = await self._resolve_model(kwargs)

        prompt = self.type_adapter.format_input(model_input)
        schema = self.type_adapter.format_output_type(output_type)
        if schema is not None:
            kwargs["response_format"] = schema

        response = await model.respond(prompt, **kwargs)
        return response.content

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **kwargs,
    ):
        raise NotImplementedError(
            "The `lmstudio` library does not support batch inference."
        )

    async def generate_stream(  # type: ignore
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using LMStudio asynchronously.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        await self._enter_context()
        model = await self._resolve_model(kwargs)

        prompt = self.type_adapter.format_input(model_input)
        schema = self.type_adapter.format_output_type(output_type)
        if schema is not None:
            kwargs["response_format"] = schema

        stream = await model.respond_stream(prompt, **kwargs)
        async for fragment in stream:
            yield fragment.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client AsyncClient

A LMStudio AsyncClient instance.

required
model_name Optional[str]

The name of the model to use. If not provided, uses the default loaded model in LMStudio.

None
Source code in outlines/models/lmstudio.py
def __init__(
    self, client: "AsyncClient", model_name: Optional[str] = None
):
    """
    Parameters
    ----------
    client
        A LMStudio AsyncClient instance.
    model_name
        The name of the model to use. If not provided, uses the default
        loaded model in LMStudio.

    """
    # Adapter that translates high-level inputs/outputs into LMStudio
    # arguments.
    self.type_adapter = LMStudioTypeAdapter()
    self.client = client
    self.model_name = model_name
    # Tracks whether the client's async context has been entered yet.
    self._context_entered = False

close() async

Close the async client and release resources.

Source code in outlines/models/lmstudio.py
async def close(self) -> None:
    """Close the async client and release resources."""
    # Nothing to do if the client context was never entered.
    if not self._context_entered:
        return
    await self.client.__aexit__(None, None, None)
    self._context_entered = False

generate(model_input, output_type=None, **kwargs) async

Generate text using LMStudio asynchronously.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/lmstudio.py
async def generate(
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> str:
    """Generate text using LMStudio asynchronously.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    str
        The text generated by the model.

    """
    # Lazily enter the client's async context on first use.
    if not self._context_entered:
        await self.client.__aenter__()
        self._context_entered = True

    # An explicit "model" kwarg wins over the instance default.
    if "model" not in kwargs and self.model_name is not None:
        kwargs["model"] = self.model_name

    model_key = kwargs.pop("model", None)
    if model_key:
        model = await self.client.llm.model(model_key)
    else:
        model = await self.client.llm.model()

    prompt = self.type_adapter.format_input(model_input)
    schema = self.type_adapter.format_output_type(output_type)
    if schema is not None:
        kwargs["response_format"] = schema

    response = await model.respond(prompt, **kwargs)
    return response.content

generate_stream(model_input, output_type=None, **kwargs) async

Stream text using LMStudio asynchronously.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the model.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/lmstudio.py
async def generate_stream(  # type: ignore
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text using LMStudio asynchronously.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the model.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    """
    # Lazily enter the client's async context on first use.
    if not self._context_entered:
        await self.client.__aenter__()
        self._context_entered = True

    # An explicit "model" kwarg wins over the instance default.
    if "model" not in kwargs and self.model_name is not None:
        kwargs["model"] = self.model_name

    model_key = kwargs.pop("model", None)
    if model_key:
        model = await self.client.llm.model(model_key)
    else:
        model = await self.client.llm.model()

    prompt = self.type_adapter.format_input(model_input)
    schema = self.type_adapter.format_output_type(output_type)
    if schema is not None:
        kwargs["response_format"] = schema

    stream = await model.respond_stream(prompt, **kwargs)
    async for fragment in stream:
        yield fragment.content

AsyncMistral

Bases: AsyncModel

Async thin wrapper around the mistralai.Mistral client.

Converts input and output types to arguments for the mistralai.Mistral client's async methods (chat.complete_async or chat.stream_async).

Source code in outlines/models/mistral.py
class AsyncMistral(AsyncModel):
    """Async thin wrapper around the `mistralai.Mistral` client.

    Converts input and output types to arguments for the `mistralai.Mistral`
    client's async methods (`chat.complete_async` or `chat.stream_async`).

    """

    def __init__(
        self, client: "MistralClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client : MistralClient
            A mistralai.Mistral client instance.
        model_name : Optional[str]
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = MistralTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate a response from the model asynchronously.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs : Any
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The response generated by the model as text. A list is returned
            when the API returns several choices.

        Raises
        ------
        TypeError
            If the schema derived from `output_type` is rejected by Mistral.
        RuntimeError
            For any other error raised by the Mistral API.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = await self.client.chat.complete_async(
                messages=messages,
                response_format=response_format,
                stream=False,
                **inference_kwargs,
            )
        except Exception as e:
            # "schema" is a substring of "json_schema", so a single test
            # covers both error messages; chain with `from e` so the
            # original cause is preserved (consistent with the branch below).
            if "schema" in str(e).lower():
                raise TypeError(
                    f"Mistral does not support your schema: {e}. "
                    "Try a local model or dottxt instead."
                ) from e
            else:
                raise RuntimeError(f"Mistral API error: {e}") from e

        outputs = [choice.message for choice in result.choices]
        contents = [message.content for message in outputs]

        # Unwrap the single-choice case for convenience.
        if len(contents) == 1:
            return contents[0]
        return contents

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        raise NotImplementedError(
            "The mistralai library does not support batch inference."
        )

    async def generate_stream(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Generate text from the model as an async stream of chunks.

        Parameters
        ----------
        model_input
            str, list, or chat input to generate from.
        output_type
            Optional type for structured output.
        **inference_kwargs
            Extra kwargs like "model" name.

        Yields
        ------
        str
            Chunks of text as they are streamed.

        Raises
        ------
        TypeError
            If the schema derived from `output_type` is rejected by Mistral.
        RuntimeError
            For any other error raised by the Mistral API.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            response = await self.client.chat.stream_async(
                messages=messages,
                response_format=response_format,
                **inference_kwargs
            )
        except Exception as e:
            # Same error translation as `generate`; `from e` keeps the
            # original exception as the cause.
            if "schema" in str(e).lower():
                raise TypeError(
                    f"Mistral does not support your schema: {e}. "
                    "Try a local model or dottxt instead."
                ) from e
            else:
                raise RuntimeError(f"Mistral API error: {e}") from e

        async for chunk in response:
            # Only yield chunks that actually carry new text content.
            if (
                hasattr(chunk, "data")
                and chunk.data.choices
                and len(chunk.data.choices) > 0
                and hasattr(chunk.data.choices[0], "delta")
                and chunk.data.choices[0].delta.content is not None
            ):
                yield chunk.data.choices[0].delta.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Mistral

A mistralai.Mistral client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/mistral.py
def __init__(
    self, client: "MistralClient", model_name: Optional[str] = None
):
    """
    Parameters
    ----------
    client : MistralClient
        A mistralai.Mistral client instance.
    model_name : Optional[str]
        The name of the model to use.

    """
    # Adapter that translates high-level inputs/outputs into Mistral API
    # arguments.
    self.type_adapter = MistralTypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **inference_kwargs) async

Generate a response from the model asynchronously.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt or chat messages to generate a response from.

required
output_type Optional[Any]

The desired format of the response (e.g., JSON schema).

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The response generated by the model as text.

Source code in outlines/models/mistral.py
async def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate a response from the model asynchronously.

    Parameters
    ----------
    model_input : Union[Chat, list, str]
        The prompt or chat messages to generate a response from.
    output_type : Optional[Any]
        The desired format of the response (e.g., JSON schema).
    **inference_kwargs : Any
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The response generated by the model as text. A list is returned
        when the API returns several choices.

    Raises
    ------
    TypeError
        If the schema derived from `output_type` is rejected by Mistral.
    RuntimeError
        For any other error raised by the Mistral API.

    """
    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        result = await self.client.chat.complete_async(
            messages=messages,
            response_format=response_format,
            stream=False,
            **inference_kwargs,
        )
    except Exception as e:
        # "schema" is a substring of "json_schema", so a single test covers
        # both error messages; chain with `from e` so the original cause is
        # preserved (consistent with the branch below).
        if "schema" in str(e).lower():
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        else:
            raise RuntimeError(f"Mistral API error: {e}") from e

    outputs = [choice.message for choice in result.choices]
    contents = [message.content for message in outputs]

    # Unwrap the single-choice case for convenience.
    if len(contents) == 1:
        return contents[0]
    return contents

generate_stream(model_input, output_type=None, **inference_kwargs) async

Generate text from the model as an async stream of chunks.

Parameters:

Name Type Description Default
model_input

str, list, or chat input to generate from.

required
output_type

Optional type for structured output.

None
**inference_kwargs

Extra kwargs like "model" name.

{}

Yields:

Type Description
str

Chunks of text as they are streamed.

Source code in outlines/models/mistral.py
async def generate_stream(
    self,
    model_input,
    output_type=None,
    **inference_kwargs,
):
    """Generate text from the model as an async stream of chunks.

    Parameters
    ----------
    model_input
        str, list, or chat input to generate from.
    output_type
        Optional type for structured output.
    **inference_kwargs
        Extra kwargs like "model" name.

    Yields
    ------
    str
        Chunks of text as they are streamed.

    Raises
    ------
    TypeError
        If the schema derived from `output_type` is rejected by Mistral.
    RuntimeError
        For any other error raised by the Mistral API.

    """
    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        response = await self.client.chat.stream_async(
            messages=messages,
            response_format=response_format,
            **inference_kwargs
        )
    except Exception as e:
        # "schema" is a substring of "json_schema", so a single test covers
        # both error messages; `from e` preserves the original cause
        # (consistent with the branch below).
        if "schema" in str(e).lower():
            raise TypeError(
                f"Mistral does not support your schema: {e}. "
                "Try a local model or dottxt instead."
            ) from e
        else:
            raise RuntimeError(f"Mistral API error: {e}") from e

    async for chunk in response:
        # Only yield chunks that actually carry new text content.
        if (
            hasattr(chunk, "data")
            and chunk.data.choices
            and len(chunk.data.choices) > 0
            and hasattr(chunk.data.choices[0], "delta")
            and chunk.data.choices[0].delta.content is not None
        ):
            yield chunk.data.choices[0].delta.content

AsyncOllama

Bases: AsyncModel

Thin wrapper around the ollama.AsyncClient client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the ollama.AsyncClient client.

Source code in outlines/models/ollama.py
class AsyncOllama(AsyncModel):
    """Thin wrapper around the `ollama.AsyncClient` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `ollama.AsyncClient` client.

    """

    def __init__(
        self, client: "AsyncClient", model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `ollama.AsyncClient` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        # Adapter that translates high-level inputs/outputs into ollama
        # arguments.
        self.type_adapter = OllamaTypeAdapter()

    async def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using Ollama.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        # An explicit "model" kwarg wins over the instance default.
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        formatted_messages = self.type_adapter.format_input(model_input)
        formatted_type = self.type_adapter.format_output_type(output_type)
        response = await self.client.chat(
            messages=formatted_messages,
            format=formatted_type,
            **kwargs,
        )
        return response.message.content

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **kwargs,
    ):
        raise NotImplementedError(
            "The `ollama` library does not support batch inference."
        )

    async def generate_stream(  # type: ignore
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using Ollama.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        # An explicit "model" kwarg wins over the instance default.
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        formatted_messages = self.type_adapter.format_input(model_input)
        formatted_type = self.type_adapter.format_output_type(output_type)
        stream = await self.client.chat(
            messages=formatted_messages,
            format=formatted_type,
            stream=True,
            **kwargs,
        )
        async for chunk in stream:
            yield chunk.message.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client AsyncClient

The ollama.Client client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/ollama.py
def __init__(
    self, client: "AsyncClient", model_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    client
        The `ollama.AsyncClient` client.
    model_name
        The name of the model to use.

    """
    # Adapter that translates high-level inputs/outputs into ollama
    # arguments.
    self.type_adapter = OllamaTypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **kwargs) async

Generate text using Ollama.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/ollama.py
async def generate(
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> str:
    """Generate text using Ollama.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The text generated by the model.

    """
    # An explicit "model" kwarg wins over the instance default.
    if "model" not in kwargs and self.model_name is not None:
        kwargs["model"] = self.model_name

    formatted_messages = self.type_adapter.format_input(model_input)
    formatted_type = self.type_adapter.format_output_type(output_type)
    response = await self.client.chat(
        messages=formatted_messages,
        format=formatted_type,
        **kwargs,
    )
    return response.message.content

generate_stream(model_input, output_type=None, **kwargs) async

Stream text using Ollama.

Parameters:

Name Type Description Default
model_input Chat | str | list

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema.

None
**kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/ollama.py
async def generate_stream(  # type: ignore
    self,
    model_input: Chat | str | list,
    output_type: Optional[Any] = None,
    **kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text using Ollama.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema.
    **kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    """
    # An explicit "model" kwarg wins over the instance default.
    if "model" not in kwargs and self.model_name is not None:
        kwargs["model"] = self.model_name

    formatted_messages = self.type_adapter.format_input(model_input)
    formatted_type = self.type_adapter.format_output_type(output_type)
    stream = await self.client.chat(
        messages=formatted_messages,
        format=formatted_type,
        stream=True,
        **kwargs,
    )
    async for chunk in stream:
        yield chunk.message.content

AsyncOpenAI

Bases: AsyncModel

Thin wrapper around the openai.AsyncOpenAI client.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.AsyncOpenAI client.

Source code in outlines/models/openai.py
class AsyncOpenAI(AsyncModel):
    """Thin wrapper around the `openai.AsyncOpenAI` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.AsyncOpenAI` client.

    """

    def __init__(
        self,
        client: Union["AsyncOpenAIClient", "AsyncAzureOpenAIClient"],
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OpenAITypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        Raises
        ------
        TypeError
            If OpenAI rejects the schema derived from `output_type`.
        ValueError
            If the model refuses to answer the request.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = await self.client.chat.completions.create(
                messages=messages,
                **response_format,
                **inference_kwargs,
            )
        except openai.BadRequestError as e:
            # `body` may be None or lack a "message" key; only translate the
            # error when we can positively identify a schema rejection.
            body = e.body if isinstance(e.body, dict) else {}
            if str(body.get("message", "")).startswith("Invalid schema"):
                raise TypeError(
                    f"OpenAI does not support your schema: {body['message']}. "
                    "Try a local model or dottxt instead."
                ) from e
            # Bare `raise` preserves the original traceback.
            raise

        messages = [choice.message for choice in result.choices]
        for message in messages:
            if message.refusal is not None:
                raise ValueError(
                    f"OpenAI refused to answer the request: {message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        raise NotImplementedError(
            "The `openai` library does not support batch inference."
        )

    async def generate_stream(  # type: ignore
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs,
    ) -> AsyncIterator[str]:
        """Stream text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        Raises
        ------
        TypeError
            If OpenAI rejects the schema derived from `output_type`.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            stream = await self.client.chat.completions.create(
                stream=True,
                messages=messages,
                **response_format,
                **inference_kwargs
            )
        except openai.BadRequestError as e:
            # Same guarded error translation as `generate`.
            body = e.body if isinstance(e.body, dict) else {}
            if str(body.get("message", "")).startswith("Invalid schema"):
                raise TypeError(
                    f"OpenAI does not support your schema: {body['message']}. "
                    "Try a local model or dottxt instead."
                ) from e
            raise

        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client Union[AsyncOpenAI, AsyncAzureOpenAI]

The openai.AsyncOpenAI or openai.AsyncAzureOpenAI client.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/openai.py
def __init__(
    self,
    client: Union["AsyncOpenAIClient", "AsyncAzureOpenAIClient"],
    model_name: Optional[str] = None,
):
    """Initialize the async OpenAI model wrapper.

    Parameters
    ----------
    client
        The `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` client.
    model_name
        The name of the model to use.

    """
    # The type adapter translates Outlines inputs/output types into
    # arguments understood by the OpenAI client.
    self.type_adapter = OpenAITypeAdapter()
    self.client = client
    self.model_name = model_name

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using OpenAI.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Union[type[BaseModel], str]]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema or an empty dictionary.

None
**inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/openai.py
async def generate(
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Union[type[BaseModel], str]] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using OpenAI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema or an empty dictionary.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model. A list is returned when the
        response contains several choices.

    Raises
    ------
    TypeError
        If OpenAI rejects the JSON schema derived from `output_type`.
    ValueError
        If the model refuses to answer the request.

    """
    # Imported lazily so loading this module does not require `openai`.
    import openai

    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    # Fall back to the wrapper's default model unless the caller
    # explicitly provided one.
    if self.model_name is not None:
        inference_kwargs.setdefault("model", self.model_name)

    try:
        result = await self.client.chat.completions.create(
            messages=messages,
            **response_format,
            **inference_kwargs,
        )
    except openai.BadRequestError as e:
        if e.body["message"].startswith("Invalid schema"):
            raise TypeError(
                f"OpenAI does not support your schema: {e.body['message']}. "
                "Try a local model or dottxt instead."
            )
        else:
            raise e

    # Collect choice contents, surfacing any refusal as an error.
    contents = []
    for choice in result.choices:
        message = choice.message
        if message.refusal is not None:
            raise ValueError(
                f"OpenAI refused to answer the request: {message.refusal}"
            )
        contents.append(message.content)

    if len(contents) == 1:
        return contents[0]
    return contents

generate_stream(model_input, output_type=None, **inference_kwargs) async

Stream text using OpenAI.

Parameters:

Name Type Description Default
model_input Union[Chat, list, str]

The prompt based on which the model will generate a response.

required
output_type Optional[Union[type[BaseModel], str]]

The desired format of the response generated by the model. The output type must be of a type that can be converted to a JSON schema or an empty dictionary.

None
**inference_kwargs

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/openai.py
async def generate_stream( # type: ignore
    self,
    model_input: Union[Chat, list, str],
    output_type: Optional[Union[type[BaseModel], str]] = None,
    **inference_kwargs,
) -> AsyncIterator[str]:
    """Stream text using OpenAI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. The
        output type must be of a type that can be converted to a JSON
        schema or an empty dictionary.
    **inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    Raises
    ------
    TypeError
        If OpenAI rejects the JSON schema derived from `output_type`.

    """
    # Imported lazily so loading this module does not require `openai`.
    import openai

    messages = self.type_adapter.format_input(model_input)
    response_format = self.type_adapter.format_output_type(output_type)

    # Fall back to the wrapper's default model unless the caller set one.
    if "model" not in inference_kwargs and self.model_name is not None:
        inference_kwargs["model"] = self.model_name

    try:
        stream = await self.client.chat.completions.create(
            stream=True,
            messages=messages,
            **response_format,
            **inference_kwargs
        )
    except openai.BadRequestError as e:
        if e.body["message"].startswith("Invalid schema"):
            raise TypeError(
                f"OpenAI does not support your schema: {e.body['message']}. "
                "Try a local model or dottxt instead."
            )
        else:
            raise e

    # Only yield chunks that actually carry content (some chunks only
    # carry metadata such as the finish reason).
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content

AsyncSGLang

Bases: AsyncModel

Thin async wrapper around the openai.OpenAI client used to communicate with an SGLang server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the SGLang server.

Source code in outlines/models/sglang.py
class AsyncSGLang(AsyncModel):
    """Thin async wrapper around the `openai.OpenAI` client used to communicate
    with an SGLang server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using `sglang`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        # A refusal from the server is surfaced as an error rather than
        # silently returned as content.
        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The sglang server refused to answer the request: "
                    f"{message.refusal}"
                )

        # Unwrap single-choice responses to a plain string.
        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported by SGLang; always raises `NotImplementedError`."""
        raise NotImplementedError(
            "SGLang does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Return a text generator.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        # Only yield chunks that actually carry content.
        async for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the SGLang client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        # Output-type arguments take precedence over user-supplied kwargs.
        inference_kwargs.update(output_type_args)

        # Fall back to the wrapper's default model unless the caller set one.
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }

        return client_args

__init__(client, model_name=None)

Parameters:

Name Type Description Default
client

An openai.AsyncOpenAI client instance.

required
model_name Optional[str]

The name of the model to use.

None
Source code in outlines/models/sglang.py
def __init__(self, client, model_name: Optional[str] = None):
    """
    Parameters
    ----------
    client
        An `openai.AsyncOpenAI` client instance.
    model_name
        The name of the model to use.

    """
    self.client = client
    self.model_name = model_name
    # Translates Outlines input/output types into SGLang client arguments.
    self.type_adapter = SGLangTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using sglang.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
Union[str, list[str]]

The text generated by the model.

Source code in outlines/models/sglang.py
async def generate(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using `sglang`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model. A list is returned when the
        response contains several choices.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    response = await self.client.chat.completions.create(**client_args)

    # Collect the content of each choice, surfacing refusals as errors.
    contents = []
    for choice in response.choices:
        message = choice.message
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The sglang server refused to answer the request: "
                f"{message.refusal}"
            )
        contents.append(message.content)

    return contents[0] if len(contents) == 1 else contents

generate_stream(model_input, output_type=None, **inference_kwargs) async

Return a text generator.

Parameters:

Name Type Description Default
model_input Union[Chat, str, list]

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/sglang.py
async def generate_stream( # type: ignore
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Return a text generator.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    """
    stream = await self.client.chat.completions.create(
        **self._build_client_args(
            model_input, output_type, **inference_kwargs,
        ),
        stream=True,
    )

    # Only forward chunks that actually carry content.
    async for chunk in stream:  # pragma: no cover
        choices = chunk.choices
        if choices and choices[0].delta.content is not None:
            yield choices[0].delta.content

AsyncTGI

Bases: AsyncModel

Thin async wrapper around a huggingface_hub.AsyncInferenceClient client used to communicate with a TGI server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the huggingface_hub.AsyncInferenceClient client.

Source code in outlines/models/tgi.py
class AsyncTGI(AsyncModel):
    """Async adapter between Outlines and a `TGI` server.

    Translates the input and output types specified by the user at a higher
    level into arguments for a `huggingface_hub.AsyncInferenceClient`
    instance.

    """

    def __init__(self, client):
        """
        Parameters
        ----------
        client
            A huggingface `AsyncInferenceClient` client instance.

        """
        # The type adapter converts Outlines types into TGI client arguments.
        self.type_adapter = TGITypeAdapter()
        self.client = client

    async def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server uses
            a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        return await self.client.text_generation(
            **self._build_client_args(
                model_input, output_type, **inference_kwargs,
            )
        )

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Not supported by TGI; always raises `NotImplementedError`."""
        raise NotImplementedError("TGI does not support batch inference.")

    async def generate_stream( # type: ignore
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server uses
            a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        stream = await self.client.text_generation(
            **self._build_client_args(
                model_input, output_type, **inference_kwargs,
            ),
            stream=True,
        )

        async for chunk in stream:  # pragma: no cover
            yield chunk

    def _build_client_args(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the TGI client."""
        # Output-type arguments take precedence over user-supplied kwargs;
        # the formatted prompt is merged in under the "prompt" key.
        merged = {
            **inference_kwargs,
            **self.type_adapter.format_output_type(output_type),
        }
        return {
            "prompt": self.type_adapter.format_input(model_input),
            **merged,
        }

__init__(client)

Parameters:

Name Type Description Default
client

A huggingface AsyncInferenceClient client instance.

required
Source code in outlines/models/tgi.py
def __init__(self, client):
    """
    Parameters
    ----------
    client
        A huggingface `AsyncInferenceClient` client instance.

    """
    # The type adapter converts Outlines types into TGI client arguments.
    self.type_adapter = TGITypeAdapter()
    self.client = client

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using TGI.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types except CFG are supported provided your server uses a backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
str

The text generated by the model.

Source code in outlines/models/tgi.py
async def generate(
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using TGI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types except `CFG` are supported provided your server uses
        a backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    str
        The text generated by the model.

    """
    return await self.client.text_generation(
        **self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )
    )

generate_stream(model_input, output_type=None, **inference_kwargs) async

Stream text using TGI.

Parameters:

Name Type Description Default
model_input str

The prompt based on which the model will generate a response.

required
output_type Optional[Any]

The desired format of the response generated by the model. All output types except CFG are supported provided your server uses a backend that supports them.

None
inference_kwargs Any

Additional keyword arguments to pass to the client.

{}

Returns:

Type Description
AsyncIterator[str]

An async iterator that yields the text generated by the model.

Source code in outlines/models/tgi.py
async def generate_stream( # type: ignore
    self,
    model_input: str,
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text using TGI.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types except `CFG` are supported provided your server uses