sglang

Integration with an SGLang server.

AsyncSGLang

Bases: AsyncModel

Thin async wrapper around the openai.AsyncOpenAI client used to communicate with an SGLang server.

This wrapper converts the input and output types specified by the user at a higher level into arguments to the openai.AsyncOpenAI client for the SGLang server.
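
Example, as a minimal sketch (the server URL, api_key, and model name below are placeholders for your own deployment):

import asyncio

import openai

from outlines.models.sglang import AsyncSGLang


async def main():
    # SGLang exposes an OpenAI-compatible API, usually under /v1;
    # the api_key only needs to be non-empty.
    client = openai.AsyncOpenAI(
        base_url="http://localhost:30000/v1", api_key="EMPTY"
    )
    model = AsyncSGLang(client, model_name="your-model-name")
    result = await model.generate("Write a haiku about mountains.", max_tokens=50)
    print(result)


asyncio.run(main())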

Source code in outlines/models/sglang.py
class AsyncSGLang(AsyncModel):
    """Thin async wrapper around the `openai.OpenAI` client used to communicate
    with an SGLang server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    async def generate(
        self,
        model_input: Union[str, Vision],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using `sglang`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The sglang server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_stream( # type: ignore
        self,
        model_input: Union[str, Vision],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Return a text generator.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        async for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[str, Vision],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the SGLang client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        inference_kwargs.update(output_type_args)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            **messages,
            **inference_kwargs,
        }

        return client_args

__init__(client, model_name=None)

Parameters:

client (required)
    An openai.AsyncOpenAI client instance.
model_name : Optional[str] (default: None)
    The name of the model to use.

Source code in outlines/models/sglang.py
def __init__(self, client, model_name: Optional[str] = None):
    """
    Parameters
    ----------
    client
        An `openai.AsyncOpenAI` client instance.
    model_name
        The name of the model to use.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = SGLangTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs) async

Generate text using SGLang.

Parameters:

model_input : Union[str, Vision] (required)
    The prompt based on which the model will generate a response.
output_type : Optional[Any] (default: None)
    The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.
inference_kwargs : Any (default: {})
    Additional keyword arguments to pass to the client.

Returns:

Union[str, list[str]]
    The text generated by the model.

Source code in outlines/models/sglang.py
async def generate(
    self,
    model_input: Union[str, Vision],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using `sglang`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    response = await self.client.chat.completions.create(**client_args)

    messages = [choice.message for choice in response.choices]
    for message in messages:
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The sglang server refused to answer the request: "
                f"{message.refusal}"
            )

    if len(messages) == 1:
        return messages[0].content
    else:
        return [message.content for message in messages]
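
As an illustrative sketch, passing a Pydantic model as output_type makes the type adapter send a JSON-schema constraint to the server (assuming its structured generation backend supports JSON schema); the client settings are placeholders:

import asyncio

import openai
from pydantic import BaseModel

from outlines.models.sglang import AsyncSGLang


class City(BaseModel):
    name: str
    country: str


async def main():
    client = openai.AsyncOpenAI(
        base_url="http://localhost:30000/v1", api_key="EMPTY"
    )
    model = AsyncSGLang(client, model_name="your-model-name")
    # The response is a JSON string conforming to the City schema.
    raw = await model.generate("Name a city and its country.", output_type=City)
    print(City.model_validate_json(raw))


asyncio.run(main())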

generate_stream(model_input, output_type=None, **inference_kwargs) async

Return a text generator.

Parameters:

model_input : Union[str, Vision] (required)
    The prompt based on which the model will generate a response.
output_type : Optional[Any] (default: None)
    The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.
inference_kwargs : Any (default: {})
    Additional keyword arguments to pass to the client.

Returns:

AsyncIterator[str]
    An async iterator that yields the text generated by the model.

Source code in outlines/models/sglang.py
async def generate_stream( # type: ignore
    self,
    model_input: Union[str, Vision],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> AsyncIterator[str]:
    """Return a text generator.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    AsyncIterator[str]
        An async iterator that yields the text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    stream = await self.client.chat.completions.create(
        **client_args,
        stream=True,
    )

    async for chunk in stream:  # pragma: no cover
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content
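
A streaming sketch under the same placeholder settings:

import asyncio

import openai

from outlines.models.sglang import AsyncSGLang


async def main():
    client = openai.AsyncOpenAI(
        base_url="http://localhost:30000/v1", api_key="EMPTY"
    )
    model = AsyncSGLang(client, model_name="your-model-name")
    async for chunk in model.generate_stream("Tell me a short story."):
        print(chunk, end="", flush=True)


asyncio.run(main())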

SGLang

Bases: Model

Thin wrapper around the openai.OpenAI client used to communicate with an SGLang server.

This wrapper converts the input and output types specified by the user at a higher level into arguments to the openai.OpenAI client for the SGLang server.
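
Example, as a minimal sketch (server URL, api_key, and model name are placeholders):

import openai

from outlines.models.sglang import SGLang

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
model = SGLang(client, model_name="your-model-name")
print(model.generate("What is the capital of France?", max_tokens=20))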

Source code in outlines/models/sglang.py
class SGLang(Model):
    """Thin wrapper around the `openai.OpenAI` client used to communicate with
    an SGLang server.

    This wrapper converts the input and output types specified by the user at
    a higher level into arguments to the `openai.OpenAI` client for the
    SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            An `openai.OpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    def generate(
        self,
        model_input: Union[str, Vision],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using SGLang.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model.

        """
        client_args = self._build_client_args(
            model_input,
            output_type,
            **inference_kwargs,
        )

        response = self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The SGLang server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_stream(
        self,
        model_input: Union[str, Vision],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using SGLang.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = self.client.chat.completions.create(
            **client_args, stream=True,
        )

        for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[str, Vision],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the SGLang client."""
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)
        inference_kwargs.update(output_type_args)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            **messages,
            **inference_kwargs,
        }

        return client_args

__init__(client, model_name=None)

Parameters:

client (required)
    An openai.OpenAI client instance.
model_name : Optional[str] (default: None)
    The name of the model to use.

Source code in outlines/models/sglang.py
def __init__(self, client, model_name: Optional[str] = None):
    """
    Parameters
    ----------
    client
        An `openai.OpenAI` client instance.
    model_name
        The name of the model to use.

    """
    self.client = client
    self.model_name = model_name
    self.type_adapter = SGLangTypeAdapter()

generate(model_input, output_type=None, **inference_kwargs)

Generate text using SGLang.

Parameters:

model_input : Union[str, Vision] (required)
    The prompt based on which the model will generate a response.
output_type : Optional[Any] (default: None)
    The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.
inference_kwargs : Any (default: {})
    Additional keyword arguments to pass to the client.

Returns:

Union[str, list[str]]
    The text generated by the model.

Source code in outlines/models/sglang.py
def generate(
    self,
    model_input: Union[str, Vision],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, list[str]]:
    """Generate text using SGLang.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Union[str, list[str]]
        The text generated by the model.

    """
    client_args = self._build_client_args(
        model_input,
        output_type,
        **inference_kwargs,
    )

    response = self.client.chat.completions.create(**client_args)

    messages = [choice.message for choice in response.choices]
    for message in messages:
        if message.refusal is not None:  # pragma: no cover
            raise ValueError(
                f"The SGLang server refused to answer the request: "
                f"{message.refusal}"
            )

    if len(messages) == 1:
        return messages[0].content
    else:
        return [message.content for message in messages]
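
As an illustrative sketch, a typing.Literal passed as output_type is compiled to a regex and forwarded through extra_body (assuming the server's backend supports regex-constrained generation); the client settings are placeholders:

from typing import Literal

import openai

from outlines.models.sglang import SGLang

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
model = SGLang(client, model_name="your-model-name")
answer = model.generate(
    "Is the sky blue? Answer yes or no.",
    output_type=Literal["yes", "no"],
)
print(answer)  # constrained to "yes" or "no"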

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using SGLang.

Parameters:

model_input : Union[str, Vision] (required)
    The prompt based on which the model will generate a response.
output_type : Optional[Any] (default: None)
    The desired format of the response generated by the model. All output types available in Outlines are supported provided your server uses a structured generation backend that supports them.
inference_kwargs : Any (default: {})
    Additional keyword arguments to pass to the client.

Returns:

Iterator[str]
    An iterator that yields the text generated by the model.

Source code in outlines/models/sglang.py
def generate_stream(
    self,
    model_input: Union[str, Vision],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using SGLang.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The desired format of the response generated by the model. All
        output types available in Outlines are supported provided your
        server uses a structured generation backend that supports them.
    inference_kwargs
        Additional keyword arguments to pass to the client.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    client_args = self._build_client_args(
        model_input, output_type, **inference_kwargs,
    )

    stream = self.client.chat.completions.create(
        **client_args, stream=True,
    )

    for chunk in stream:  # pragma: no cover
        if chunk.choices and chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content
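
A streaming sketch with placeholder client settings:

import openai

from outlines.models.sglang import SGLang

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
model = SGLang(client, model_name="your-model-name")
for chunk in model.generate_stream("Explain SGLang in one sentence."):
    print(chunk, end="", flush=True)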

SGLangTypeAdapter

Bases: ModelTypeAdapter

Type adapter for the SGLang and AsyncSGLang models.

Source code in outlines/models/sglang.py
class SGLangTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `SGLang` and `AsyncSGLang` models."""

    def format_input(self, model_input: Union[str, Vision]) -> dict:
        """Generate the prompt argument to pass to the client.

        We rely on the OpenAITypeAdapter to format the input, since the
        SGLang server expects input in the same format as OpenAI.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Returns
        -------
        dict
            The formatted input to be passed to the client.

        """
        return OpenAITypeAdapter().format_input(model_input)

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the structured output argument to pass to the client.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            The formatted output type to be passed to the client.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)
        if isinstance(term, CFG):
            warnings.warn(
                "SGLang grammar-based structured outputs expects an EBNF "
                "grammar instead of a Lark grammar as is generally used in "
                "Outlines. The grammar cannot be used as a structured output "
                "type with an outlines backend, it is only compatible with "
                "the sglang and llguidance backends."
            )
            return {"extra_body": {"ebnf": term.definition}}
        elif isinstance(term, JsonSchema):
            return OpenAITypeAdapter().format_json_output_type(
                json.loads(term.schema)
            )
        else:
            return {"extra_body": {"regex": to_regex(term)}}

format_input(model_input)

Generate the prompt argument to pass to the client.

We rely on the OpenAITypeAdapter to format the input, since the SGLang server expects input in the same format as OpenAI.

Parameters:

model_input : Union[str, Vision] (required)
    The input passed by the user.

Returns:

dict
    The formatted input to be passed to the client.

Source code in outlines/models/sglang.py
def format_input(self, model_input: Union[str, Vision]) -> dict:
    """Generate the prompt argument to pass to the client.

    We rely on the OpenAITypeAdapter to format the input, since the SGLang
    server expects input in the same format as OpenAI.

    Parameters
    ----------
    model_input
        The input passed by the user.

    Returns
    -------
    dict
        The formatted input to be passed to the client.

    """
    return OpenAITypeAdapter().format_input(model_input)
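
A small sketch of the adapter in isolation; the exact message structure is produced by OpenAITypeAdapter, so the output shown in the comment is indicative rather than guaranteed:

from outlines.models.sglang import SGLangTypeAdapter

adapter = SGLangTypeAdapter()
# A plain string prompt is wrapped into OpenAI-style chat messages,
# roughly: {"messages": [{"role": "user", "content": "Hello!"}]}
print(adapter.format_input("Hello!"))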

format_output_type(output_type=None)

Generate the structured output argument to pass to the client.

Parameters:

output_type : Optional[Any] (default: None)
    The structured output type provided.

Returns:

dict
    The formatted output type to be passed to the client.

Source code in outlines/models/sglang.py
def format_output_type(self, output_type: Optional[Any] = None) -> dict:
    """Generate the structured output argument to pass to the client.

    Parameters
    ----------
    output_type
        The structured output type provided.

    Returns
    -------
    dict
        The formatted output type to be passed to the client.

    """
    if output_type is None:
        return {}

    term = python_types_to_terms(output_type)
    if isinstance(term, CFG):
        warnings.warn(
            "SGLang grammar-based structured outputs expects an EBNF "
            "grammar instead of a Lark grammar as is generally used in "
            "Outlines. The grammar cannot be used as a structured output "
            "type with an outlines backend, it is only compatible with "
            "the sglang and llguidance backends."
        )
        return {"extra_body": {"ebnf": term.definition}}
    elif isinstance(term, JsonSchema):
        return OpenAITypeAdapter().format_json_output_type(
            json.loads(term.schema)
        )
    else:
        return {"extra_body": {"regex": to_regex(term)}}

from_sglang(client, model_name=None)

Create an SGLang or AsyncSGLang instance from an openai.OpenAI or openai.AsyncOpenAI instance.

Parameters:

client : Union[OpenAI, AsyncOpenAI] (required)
    An openai.OpenAI or openai.AsyncOpenAI instance.
model_name : Optional[str] (default: None)
    The name of the model to use.

Returns:

Union[SGLang, AsyncSGLang]
    An Outlines SGLang or AsyncSGLang model instance.

Source code in outlines/models/sglang.py
def from_sglang(
    client: Union["OpenAI", "AsyncOpenAI"],
    model_name: Optional[str] = None,
) -> Union[SGLang, AsyncSGLang]:
    """Create a `SGLang` or `AsyncSGLang` instance from an `openai.OpenAI` or
    `openai.AsyncOpenAI` instance.

    Parameters
    ----------
    client
        An `openai.OpenAI` or `openai.AsyncOpenAI` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[SGLang, AsyncSGLang]
        An Outlines `SGLang` or `AsyncSGLang` model instance.

    """
    from openai import AsyncOpenAI, OpenAI

    if isinstance(client, OpenAI):
        return SGLang(client, model_name)
    elif isinstance(client, AsyncOpenAI):
        return AsyncSGLang(client, model_name)
    else:
        raise ValueError(
            f"Unsupported client type: {type(client)}.\n"
            "Please provide an OpenAI or AsyncOpenAI instance."
        )
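
A usage sketch (server URL, api_key, and model name are placeholders):

import openai

from outlines.models.sglang import from_sglang

sync_model = from_sglang(
    openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY"),
    model_name="your-model-name",
)
async_model = from_sglang(
    openai.AsyncOpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY"),
    model_name="your-model-name",
)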