vllm

Integration with a vLLM server.

`AsyncVLLM`

Bases: AsyncModel

Thin async wrapper around the openai.OpenAI client used to communicate with a vllm server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the vllm server.

Source code in outlines/models/vllm.py

class AsyncVLLM(AsyncModel):
    """Thin async wrapper around the `openai.OpenAI` client used to communicate
    with a `vllm` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    `vllm` server.
    """

    def __init__(
        self,
        client: "AsyncOpenAI",
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = VLLMTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The vLLM server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        raise NotImplementedError("VLLM does not support batch inference.")

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.
        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        async for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the OpenAI client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        extra_body = inference_kwargs.pop("extra_body", {})
        extra_body.update(output_type_args)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }
        if extra_body:
            client_args["extra_body"] = extra_body

        return client_args

`init(client, model_name=None)`

Parameters:

Name	Type	Description	Default
`client`	`AsyncOpenAI`	An `openai.AsyncOpenAI` client instance.	required

Source code in outlines/models/vllm.py

def __init__(
    self,
    client: "AsyncOpenAI",
    model_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    client
        An `openai.AsyncOpenAI` client instance.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = VLLMTypeAdapter()

`generate(model_input, output_type=None, **inference_kwargs)` `async`

Generate text using vLLM.

Parameters:

Name	Type	Description	Default
`model_input`	`Union[Chat, str, list]`	The prompt based on which the model will generate a response.	required
`output_type`	`Optional[Any]`	The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.	`None`
`inference_kwargs`	`Any`	Additional keyword arguments to pass to the client.	`{}`

Returns:

Type	Description
`Union[str, list[str]]`	The text generated by the model.

Source code in outlines/models/vllm.py

async def generate(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using vLLM.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    response = await self.client.chat.completions.create(**client_args)

    messages = [choice.message for choice in response.choices]
    for message in messages:
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The vLLM server refused to answer the request: "
                f"{message.refusal}"
            )

    if len(messages) == 1:
        return messages[0].content
    else:
        return [message.content for message in messages]

`generate_stream(model_input, output_type=None, **inference_kwargs)` `async`

Stream text using vLLM.

Parameters:

Name	Type	Description	Default
`model_input`	`Union[Chat, str, list]`	The prompt based on which the model will generate a response.	required
`output_type`	`Optional[Any]`	The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.	`None`
`inference_kwargs`	`Any`	Additional keyword arguments to pass to the client.	`{}`

Returns:

Type	Description
`AsyncIterator[str]`	An async iterator that yields the text generated by the model.

Source code in outlines/models/vllm.py

async def generate_stream( # type: ignore
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Stream text using vLLM.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.
    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    stream = await self.client.chat.completions.create(
        **client_args,
        stream=True,
    )

    async for chunk in stream:  # pragma: no cover
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content

`VLLM`

Bases: Model

Thin wrapper around the openai.OpenAI client used to communicate with a vllm server.

This wrapper is used to convert the input and output types specified by the users at a higher level to arguments to the openai.OpenAI client for the vllm server.

Source code in outlines/models/vllm.py

class VLLM(Model):
    """Thin wrapper around the `openai.OpenAI` client used to communicate with
    a `vllm` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    `vllm` server.
    """

    def __init__(
        self,
        client: "OpenAI",
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            An `openai.OpenAI` client instance.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = VLLMTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        """
        client_args = self._build_client_args(
            model_input,
            output_type,
            **inference_kwargs,
        )

        response = self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The vLLM server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        raise NotImplementedError("VLLM does not support batch inference.")

    def generate_stream(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = self.client.chat.completions.create(
            **client_args, stream=True,
        )

        for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the OpenAI client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        extra_body = inference_kwargs.pop("extra_body", {})
        extra_body.update(output_type_args)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }
        if extra_body:
            client_args["extra_body"] = extra_body

        return client_args

`init(client, model_name=None)`

Parameters:

Name	Type	Description	Default
`client`	`OpenAI`	An `openai.OpenAI` client instance.	required

Source code in outlines/models/vllm.py

def __init__(
    self,
    client: "OpenAI",
    model_name: Optional[str] = None,
):
    """
    Parameters
    ----------
    client
        An `openai.OpenAI` client instance.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = VLLMTypeAdapter()

`generate(model_input, output_type=None, **inference_kwargs)`

Generate text using vLLM.

Parameters:

Name	Type	Description	Default
`model_input`	`Union[Chat, str, list]`	The prompt based on which the model will generate a response.	required
`output_type`	`Optional[Any]`	The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.	`None`
`inference_kwargs`	`Any`	Additional keyword arguments to pass to the client.	`{}`

Returns:

Type	Description
`Union[str, list[str]]`	The text generated by the model.

Source code in outlines/models/vllm.py

def generate(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using vLLM.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input,
        output_type,
        **inference_kwargs,
    )

    response = self.client.chat.completions.create(**client_args)

    messages = [choice.message for choice in response.choices]
    for message in messages:
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The vLLM server refused to answer the request: "
                f"{message.refusal}"
            )

    if len(messages) == 1:
        return messages[0].content
    else:
        return [message.content for message in messages]

`generate_stream(model_input, output_type=None, **inference_kwargs)`

Stream text using vLLM.

Parameters:

Name	Type	Description	Default
`model_input`	`Union[Chat, str, list]`	The prompt based on which the model will generate a response.	required
`output_type`	`Optional[Any]`	The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.	`None`
`inference_kwargs`	`Any`	Additional keyword arguments to pass to the client.	`{}`

Returns:

Type	Description
`Iterator[str]`	An iterator that yields the text generated by the model.

Source code in outlines/models/vllm.py

def generate_stream(
    self,
    model_input: Union[Chat, str, list],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using vLLM.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    stream = self.client.chat.completions.create(
        **client_args, stream=True,
    )

    for chunk in stream:  # pragma: no cover
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content

`VLLMTypeAdapter`

Bases: ModelTypeAdapter

Type adapter for the VLLM and AsyncVLLM models.

Source code in outlines/models/vllm.py

class VLLMTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `VLLM` and `AsyncVLLM` models."""

    def format_input(self, model_input: Union[Chat, str, list]) -> list:
        """Generate the value of the messages argument to pass to the client.

        We rely on the OpenAITypeAdapter to format the input as the vLLM server
        expects input in the same format as OpenAI.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Returns
        -------
        list
            The formatted input to be passed to the model.

        """
        return OpenAITypeAdapter().format_input(model_input)

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the structured output argument to pass to the client.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            The structured output argument to pass to the model.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)
        if isinstance(term, CFG):
            return {"guided_grammar": term.definition}
        elif isinstance(term, JsonSchema):
            extra_body = {"guided_json": json.loads(term.schema)}
            if term.whitespace_pattern:
                extra_body["whitespace_pattern"] = term.whitespace_pattern
            return extra_body
        else:
            return {"guided_regex": to_regex(term)}

`format_input(model_input)`

Generate the value of the messages argument to pass to the client.

We rely on the OpenAITypeAdapter to format the input as the vLLM server expects input in the same format as OpenAI.

Parameters:

Name	Type	Description	Default
`model_input`	`Union[Chat, str, list]`	The input passed by the user.	required

Returns:

Type	Description
`list`	The formatted input to be passed to the model.

Source code in outlines/models/vllm.py

def format_input(self, model_input: Union[Chat, str, list]) -> list:
    """Generate the value of the messages argument to pass to the client.

    We rely on the OpenAITypeAdapter to format the input as the vLLM server
    expects input in the same format as OpenAI.

    Parameters
    ----------
    model_input
        The input passed by the user.

    Returns
    -------
    list
        The formatted input to be passed to the model.

    """
    return OpenAITypeAdapter().format_input(model_input)

`format_output_type(output_type=None)`

Generate the structured output argument to pass to the client.

Parameters:

Name	Type	Description	Default
`output_type`	`Optional[Any]`	The structured output type provided.	`None`

Returns:

Type	Description
`dict`	The structured output argument to pass to the model.

Source code in outlines/models/vllm.py

def format_output_type(self, output_type: Optional[Any] = None) -> dict:
    """Generate the structured output argument to pass to the client.

    Parameters
    ----------
    output_type
        The structured output type provided.

    Returns
    -------
    dict
        The structured output argument to pass to the model.

    """
    if output_type is None:
        return {}

    term = python_types_to_terms(output_type)
    if isinstance(term, CFG):
        return {"guided_grammar": term.definition}
    elif isinstance(term, JsonSchema):
        extra_body = {"guided_json": json.loads(term.schema)}
        if term.whitespace_pattern:
            extra_body["whitespace_pattern"] = term.whitespace_pattern
        return extra_body
    else:
        return {"guided_regex": to_regex(term)}

`from_vllm(client, model_name=None)`

Create an Outlines VLLM or AsyncVLLM model instance from an openai.OpenAI or openai.AsyncOpenAI instance.

Parameters:

Name	Type	Description	Default
`client`	`Union[OpenAI, AsyncOpenAI]`	An `openai.OpenAI` or `openai.AsyncOpenAI` instance.	required
`model_name`	`Optional[str]`	The name of the model to use.	`None`

Returns:

Type	Description
`Union[VLLM, AsyncVLLM]`	An Outlines `VLLM` or `AsyncVLLM` model instance.

Source code in outlines/models/vllm.py

def from_vllm(
    client: Union["OpenAI", "AsyncOpenAI"],
    model_name: Optional[str] = None,
) -> Union[VLLM, AsyncVLLM]:
    """Create an Outlines `VLLM` or `AsyncVLLM` model instance from an
    `openai.OpenAI` or `openai.AsyncOpenAI` instance.

    Parameters
    ----------
    client
        An `openai.OpenAI` or `openai.AsyncOpenAI` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[VLLM, AsyncVLLM]
        An Outlines `VLLM` or `AsyncVLLM` model instance.

    """
    from openai import AsyncOpenAI, OpenAI

    if isinstance(client, OpenAI):
        return VLLM(client, model_name)
    elif isinstance(client, AsyncOpenAI):
        return AsyncVLLM(client, model_name)
    else:
        raise ValueError(
            f"Unsupported client type: {type(client)}.\n"
            "Please provide an OpenAI or AsyncOpenAI instance."
        )

vllm

AsyncVLLM

__init__(client, model_name=None)

generate(model_input, output_type=None, **inference_kwargs) async

generate_stream(model_input, output_type=None, **inference_kwargs) async

VLLM

__init__(client, model_name=None)

generate(model_input, output_type=None, **inference_kwargs)

generate_stream(model_input, output_type=None, **inference_kwargs)

VLLMTypeAdapter

format_input(model_input)

format_output_type(output_type=None)

from_vllm(client, model_name=None)

`AsyncVLLM`

`init(client, model_name=None)`

`generate(model_input, output_type=None, **inference_kwargs)` `async`

`generate_stream(model_input, output_type=None, **inference_kwargs)` `async`

`VLLM`

`init(client, model_name=None)`

`generate(model_input, output_type=None, **inference_kwargs)`

`generate_stream(model_input, output_type=None, **inference_kwargs)`

`VLLMTypeAdapter`

`format_input(model_input)`

`format_output_type(output_type=None)`

`from_vllm(client, model_name=None)`