vllm_offline

Integration with the vllm library (offline mode).

VLLMOffline

Bases: Model

Thin wrapper around a vllm.LLM model.

This wrapper converts the input and output types specified by the user at a higher level into arguments for the `vllm.LLM` model.
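
For instance, a minimal sketch of wrapping a `vllm.LLM` instance directly and generating unconstrained text (the model id below is a placeholder, not something prescribed by the library):

from vllm import LLM
from outlines.models.vllm_offline import VLLMOffline

llm = LLM("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model id
model = VLLMOffline(llm)
answer = model.generate("What is the capital of France?")
# A single prompt with a single sample returns a plain string.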

Source code in outlines/models/vllm_offline.py
class VLLMOffline(Model):
    """Thin wrapper around a `vllm.LLM` model.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `vllm.LLM` model.

    """

    def __init__(self, model: "LLM"):
        """Create a VLLM model instance.

        Parameters
        ----------
        model
            A `vllm.LLM` model instance.

        """
        self.model = model
        self.type_adapter = VLLMOfflineTypeAdapter()
        self.lora_request = None # v0 legacy, to be removed

    def generate(
        self,
        model_input: Union[str, List[str]],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            in the `vllm.LLM` model.

        Returns
        -------
        Union[str, List[str], List[List[str]]]
            The text generated by the model.

        """
        from vllm.sampling_params import GuidedDecodingParams, SamplingParams

        sampling_params = inference_kwargs.pop("sampling_params", None)

        if sampling_params is None:
            sampling_params = SamplingParams()

        output_type_args = self.type_adapter.format_output_type(output_type)
        if output_type_args:
            sampling_params.guided_decoding = GuidedDecodingParams(**output_type_args)

        results = self.model.generate(
            self.type_adapter.format_input(model_input),
            sampling_params=sampling_params,
            lora_request=self.lora_request, # v0 legacy, to be removed
            **inference_kwargs,
        )
        results = [[sample.text for sample in batch.outputs] for batch in results]

        batch_size = len(results)
        sample_size = len(results[0])

        if batch_size == 1 and sample_size == 1:
            return results[0][0]
        elif batch_size == 1:
            return results[0]
        elif sample_size == 1:
            return [batch[0] for batch in results]

        return results

    def generate_stream(self, model_input, output_type, **inference_kwargs):
        """Not available for `vllm.LLM`.

        TODO: Implement the streaming functionality ourselves.

        """
        raise NotImplementedError(
            "Streaming is not available for the vLLM integration."
        )

    def load_lora(self, adapter_path: Optional[str]) -> None:
        """Load a LoRA adapter. Deprecated since v1.0.0.

        Use the `lora_request` argument when calling the model or generator
        instead.

        """
        warnings.warn("""
            The `load_lora` method is deprecated starting from v1.0.0.
            Support for it will be removed in v1.1.0.
            Please use v1 of the `outlines` library and the
            `outlines.from_vllm` function to create a `VLLM` model
            instance.
            In v1, you must pass the `lora_request` argument as
            a keyword argument when calling the model or generator.
            """)

        from vllm.lora.request import LoRARequest

        if adapter_path is None:
            self.lora_request = None
        else:
            self.lora_request = LoRARequest(adapter_path, 1, adapter_path)

__init__(model)

Create a VLLM model instance.

Parameters:

model : LLM, required
    A `vllm.LLM` model instance.

Source code in outlines/models/vllm_offline.py
def __init__(self, model: "LLM"):
    """Create a VLLM model instance.

    Parameters
    ----------
    model
        A `vllm.LLM` model instance.

    """
    self.model = model
    self.type_adapter = VLLMOfflineTypeAdapter()
    self.lora_request = None # v0 legacy, to be removed

generate(model_input, output_type=None, **inference_kwargs)

Generate text using vLLM.

Parameters:

model_input : Union[str, List[str]], required
    The prompt based on which the model will generate a response.
output_type : Optional[Any], default None
    The logits processor the model will use to constrain the format of the generated text.
inference_kwargs : Any, default {}
    Additional keyword arguments to pass to the `generate` method of the `vllm.LLM` model.

Returns:

Union[str, List[str], List[List[str]]]
    The text generated by the model.
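
A hedged sketch of constrained generation with this method, assuming that a Pydantic model passed as `output_type` is converted into a JSON-schema guided-decoding constraint (the model id below is a placeholder):

from pydantic import BaseModel
from vllm import LLM
from vllm.sampling_params import SamplingParams
from outlines.models.vllm_offline import from_vllm_offline

class Character(BaseModel):
    name: str
    age: int

model = from_vllm_offline(LLM("Qwen/Qwen2.5-0.5B-Instruct"))  # placeholder model id
result = model.generate(
    "Create a fantasy character.",
    output_type=Character,
    sampling_params=SamplingParams(max_tokens=200),
)
# With a single prompt and a single sample, `result` is a string that is
# expected to contain JSON matching the Character schema.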

Source code in outlines/models/vllm_offline.py
def generate(
    self,
    model_input: Union[str, List[str]],
    output_type: Optional[Any] = None,
    **inference_kwargs: Any,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using vLLM.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    inference_kwargs
        Additional keyword arguments to pass to the `generate` method
        in the `vllm.LLM` model.

    Returns
    -------
    Union[str, List[str], List[List[str]]]
        The text generated by the model.

    """
    from vllm.sampling_params import GuidedDecodingParams, SamplingParams

    sampling_params = inference_kwargs.pop("sampling_params", None)

    if sampling_params is None:
        sampling_params = SamplingParams()

    output_type_args = self.type_adapter.format_output_type(output_type)
    if output_type_args:
        sampling_params.guided_decoding = GuidedDecodingParams(**output_type_args)

    results = self.model.generate(
        self.type_adapter.format_input(model_input),
        sampling_params=sampling_params,
        lora_request=self.lora_request, # v0 legacy, to be removed
        **inference_kwargs,
    )
    results = [[sample.text for sample in batch.outputs] for batch in results]

    batch_size = len(results)
    sample_size = len(results[0])

    if batch_size == 1 and sample_size == 1:
        return results[0][0]
    elif batch_size == 1:
        return results[0]
    elif sample_size == 1:
        return [batch[0] for batch in results]

    return results

generate_stream(model_input, output_type, **inference_kwargs)

Not available for vllm.LLM.

TODO: Implement the streaming functionality ourselves.

Source code in outlines/models/vllm_offline.py
def generate_stream(self, model_input, output_type, **inference_kwargs):
    """Not available for `vllm.LLM`.

    TODO: Implement the streaming functionality ourselves.

    """
    raise NotImplementedError(
        "Streaming is not available for the vLLM integration."
    )

load_lora(adapter_path)

Load a LoRA adapter. Deprecated since v1.0.0.

Use the lora_request argument when calling the model or generator instead.

Source code in outlines/models/vllm_offline.py
def load_lora(self, adapter_path: Optional[str]) -> None:
    """Load a LoRA adapter. Deprecated since v1.0.0.

    Use the `lora_request` argument when calling the model or generator
    instead.

    """
    warnings.warn("""
        The `load_lora` method is deprecated starting from v1.0.0.
        Support for it will be removed in v1.1.0.
        Please use v1 of the `outlines` library and the
        `outlines.from_vllm` function to create a `VLLM` model
        instance.
        In v1, you must pass the `lora_request` argument as
        a keyword argument when calling the model or generator.
        """)

    from vllm.lora.request import LoRARequest

    if adapter_path is None:
        self.lora_request = None
    else:
        self.lora_request = LoRARequest(adapter_path, 1, adapter_path)

VLLMOfflineTypeAdapter

Bases: ModelTypeAdapter

Type adapter for the VLLMOffline model.

Source code in outlines/models/vllm_offline.py
class VLLMOfflineTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `VLLMOffline` model."""

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the model.

        Argument
        --------
        model_input
            The input passed by the user.

        """
        raise NotImplementedError(
            f"The input type {input} is not available. "
            "Please use a string or a list of strings."
        )

    @format_input.register(str)
    def format_str_input(self, model_input: str) -> str:
        return model_input

    @format_input.register(list)
    def format_list_input(self, model_input: List[str]) -> List[str]:
        return model_input

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the structured output argument to pass to the model.

        For vLLM, the structured output definition is set in the
        `GuidedDecodingParams` constructor that is provided as a value to the
        `guided_decoding` parameter of the `SamplingParams` constructor, itself
        provided as a value to the `sampling_params` parameter of the `generate`
        method.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            The arguments to provide to the `GuidedDecodingParams` constructor.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)
        if isinstance(term, CFG):
            return {"grammar": term.definition}
        elif isinstance(term, JsonSchema):
            guided_decoding_params = {"json": json.loads(term.schema)}
            if term.whitespace_pattern:
                guided_decoding_params["whitespace_pattern"] = term.whitespace_pattern
            return guided_decoding_params
        else:
            return {"regex": to_regex(term)}

format_input(model_input)

Generate the prompt argument to pass to the model.

Parameters:

model_input
    The input passed by the user.

Source code in outlines/models/vllm_offline.py
@singledispatchmethod
def format_input(self, model_input):
    """Generate the prompt argument to pass to the model.

    Argument
    --------
    model_input
        The input passed by the user.

    """
    raise NotImplementedError(
        f"The input type {input} is not available. "
        "Please use a string or a list of strings."
    )

format_output_type(output_type=None)

Generate the structured output argument to pass to the model.

For vLLM, the structured output definition is passed to the `GuidedDecodingParams` constructor; the resulting object is set as the `guided_decoding` parameter of `SamplingParams`, which is in turn passed as the `sampling_params` argument of the `generate` method.

Parameters:

output_type : Optional[Any], default None
    The structured output type provided.

Returns:

dict
    The arguments to provide to the `GuidedDecodingParams` constructor.
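
A small sketch of using the adapter on its own, assuming a Pydantic model is converted to a JSON-schema term, and mirroring how `generate` wires the result into `SamplingParams`:

from pydantic import BaseModel
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from outlines.models.vllm_offline import VLLMOfflineTypeAdapter

class Character(BaseModel):
    name: str
    age: int

adapter = VLLMOfflineTypeAdapter()
args = adapter.format_output_type(Character)
# For a Pydantic model, `args` should take the form {"json": {...schema...}}.

sampling_params = SamplingParams()
sampling_params.guided_decoding = GuidedDecodingParams(**args)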

Source code in outlines/models/vllm_offline.py
def format_output_type(self, output_type: Optional[Any] = None) -> dict:
    """Generate the structured output argument to pass to the model.

    For vLLM, the structured output definition is set in the
    `GuidedDecodingParams` constructor that is provided as a value to the
    `guided_decoding` parameter of the `SamplingParams` constructor, itself
    provided as a value to the `sampling_params` parameter of the `generate`
    method.

    Parameters
    ----------
    output_type
        The structured output type provided.

    Returns
    -------
    dict
        The arguments to provide to the `GuidedDecodingParams` constructor.

    """
    if output_type is None:
        return {}

    term = python_types_to_terms(output_type)
    if isinstance(term, CFG):
        return {"grammar": term.definition}
    elif isinstance(term, JsonSchema):
        guided_decoding_params = {"json": json.loads(term.schema)}
        if term.whitespace_pattern:
            guided_decoding_params["whitespace_pattern"] = term.whitespace_pattern
        return guided_decoding_params
    else:
        return {"regex": to_regex(term)}

from_vllm_offline(model)

Create an Outlines VLLMOffline model instance from a vllm.LLM instance.

Parameters:

model : LLM, required
    A `vllm.LLM` instance.

Returns:

VLLMOffline
    An Outlines `VLLMOffline` model instance.
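
A one-line sketch of the factory (the model id below is a placeholder):

from vllm import LLM
from outlines.models.vllm_offline import from_vllm_offline

model = from_vllm_offline(LLM("Qwen/Qwen2.5-0.5B-Instruct"))  # placeholder model id

This is equivalent to calling the `VLLMOffline` constructor directly with the same `vllm.LLM` instance.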

Source code in outlines/models/vllm_offline.py
def from_vllm_offline(model: "LLM") -> VLLMOffline:
    """Create an Outlines `VLLMOffline` model instance from a `vllm.LLM`
    instance.

    Parameters
    ----------
    model
        A `vllm.LLM` instance.

    Returns
    -------
    VLLMOffline
        An Outlines `VLLMOffline` model instance.

    """
    return VLLMOffline(model)