Models

Module that contains all the models integrated in outlines.

We group the models in submodules by provider instead of theme (completion, chat completion, diffusers, etc.) and use routing functions everywhere else in the codebase.

exllamav2

ExLlamaV2Model

Represents an exl2 model.

Source code in outlines/models/exllamav2.py
class ExLlamaV2Model:
    """Represents a `exl2` model."""

    def __init__(
        self,
        generator: "ExLlamaV2DynamicGenerator",
        tokenizer: "OutlinesExLlamaV2Tokenizer",
        max_seq_len: int,
    ):
        self.generator = generator
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def prepare_generation_parameters(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        sampling_parameters: SamplingParameters,
        structure_logits_processor,
        **exllamav2_params: Unpack[ExllamaV2Params],
    ) -> Tuple[ExllamaV2Params, Union[str, List[str]]]:
        """Prepare the generation parameters.

        `exllamav2` uses different default values

        """
        from exllamav2.generator import ExLlamaV2Sampler

        if isinstance(prompts, str):
            prompts = [prompts]
        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

        if max_tokens is None:
            max_tokens = []
            for prompt in prompts:
                ids = self.generator.tokenizer.encode(
                    prompt, encode_special_tokens=True
                )
                prompt_tokens = ids.shape[-1]
                max_tokens.append(self.max_seq_len - prompt_tokens)
            exllamav2_params["max_new_tokens"] = max_tokens
        else:
            exllamav2_params["max_new_tokens"] = [
                max_tokens for _ in range(len(prompts))
            ]

        stop_conditions = [self.generator.tokenizer.eos_token_id]
        if isinstance(generation_parameters.stop_at, str):
            stop_conditions.append(generation_parameters.stop_at)
        elif isinstance(generation_parameters.stop_at, list):
            for stop_at in generation_parameters.stop_at:
                stop_conditions.append(stop_at)
        exllamav2_params["stop_conditions"] = stop_conditions
        exllamav2_params["seed"] = seed

        gen_settings = ExLlamaV2Sampler.Settings()
        if sampling_parameters.temperature is not None:
            gen_settings.temperature = sampling_parameters.temperature
        if sampling_parameters.top_p is not None:
            gen_settings.top_p = sampling_parameters.top_p
        if sampling_parameters.top_k is not None:
            gen_settings.top_k = sampling_parameters.top_k
        gen_settings.logits_processor = structure_logits_processor
        exllamav2_params["gen_settings"] = gen_settings
        if sampling_parameters.num_samples > 1:
            prompts = prompts * sampling_parameters.num_samples
            exllamav2_params["max_new_tokens"] = (
                exllamav2_params["max_new_tokens"] * sampling_parameters.num_samples
            )

        if len(prompts) == 1:
            prompts = prompts[0]

        return exllamav2_params, prompts

    def reformat_output(
        self, output: Union[str, List[str]], sampling_parameters: SamplingParameters
    ):
        """
        Reformat the output from exllamav2's format to Outlines' format.

        exllamav2 works with either a list or a string (it also supports CFG
        sampling with tuples, but we ignore that for now). exllamav2's logic is:

        1. If the prompt is a string, return a string. This is the same as Outlines.
        2. If the prompt is a list, return a list. This differs from Outlines,
           which returns a string when the list contains a single element.
        3. exllamav2 has no notion of `num_samples`, so the prompts are duplicated
           `num_samples` times and this function groups the output into a list of lists.
        """
        if isinstance(output, str):
            return output
        if len(output) == 1:
            return output[0]
        if sampling_parameters.num_samples > 1:
            if len(output) == sampling_parameters.num_samples:
                return output
            assert len(output) % sampling_parameters.num_samples == 0
            num_items_per_sample = len(output) // sampling_parameters.num_samples
            new_output = []
            for i in range(sampling_parameters.num_samples):
                curr_sample = []
                for j in range(num_items_per_sample):
                    curr_sample.append(output[i * num_items_per_sample + j])
                new_output.append(curr_sample)
            return new_output
        return output

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **exllamav2_params: Unpack[ExllamaV2Params],
    ) -> Union[str, List[str]]:
        exllamav2_params, prompts = self.prepare_generation_parameters(
            prompts,
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
        )
        """
        In exllamav2, it needs the max amount of new tokens generated.
        The reason exllamav2_params["max_new_tokens"] is a list is because in prepare_generation_parameters
        the max amount of tokens that can be generated by the model for each prompt(by encoding with tokenizer) is calculated.
        The minimum is picked because otherwise it might be possible for one of the
        prompts to exceed the max sequence length.
        """
        output = self.generator.generate(
            prompt=prompts,
            gen_settings=exllamav2_params["gen_settings"],
            max_new_tokens=min(exllamav2_params["max_new_tokens"]),
            completion_only=True,
            encode_special_tokens=True,
            stop_conditions=exllamav2_params["stop_conditions"],
            add_bos=False,
            seed=exllamav2_params["seed"],
        )

        return self.reformat_output(output, sampling_parameters)

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **exllamav2_params: Unpack[ExllamaV2Params],
    ) -> Iterator[Union[str, List[str]]]:
        from exllamav2.generator import ExLlamaV2DynamicJob

        exllamav2_params, prompts = self.prepare_generation_parameters(
            prompts,
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
        )

        order = {}
        if isinstance(prompts, str):
            prompts = [prompts]
        batch_size = len(prompts)
        seed = exllamav2_params["seed"]
        for idx, p in enumerate(prompts):
            input_ids = self.generator.tokenizer.encode(
                p, encode_special_tokens=True, add_bos=False
            )

            job = ExLlamaV2DynamicJob(
                input_ids=input_ids,
                max_new_tokens=exllamav2_params["max_new_tokens"][idx],
                min_new_tokens=0,
                seed=seed,
                stop_conditions=exllamav2_params["stop_conditions"],
                gen_settings=exllamav2_params["gen_settings"],
                token_healing=False,
                decode_special_tokens=False,
            )

            if seed is not None:
                seed += 1

            serial = self.generator.enqueue(job)
            order[serial] = idx

        # Collect outputs until all jobs finish

        next_text = [""] * batch_size

        def token_generator() -> Iterator[str]:
            while self.generator.num_remaining_jobs():
                results = self.generator.iterate()
                for r in results:
                    idx = order[r["serial"]]
                    if r["stage"] == "streaming":
                        text = r.get("text", "")
                        next_text[idx] = text
                    if r["eos"]:
                        next_text[idx] = ""
                yield self.reformat_output(next_text, sampling_parameters)
            return

        return token_generator()

prepare_generation_parameters(prompts, generation_parameters, sampling_parameters, structure_logits_processor, **exllamav2_params)

Prepare the generation parameters.

exllamav2 uses different default values

Source code in outlines/models/exllamav2.py
def prepare_generation_parameters(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    sampling_parameters: SamplingParameters,
    structure_logits_processor,
    **exllamav2_params: Unpack[ExllamaV2Params],
) -> Tuple[ExllamaV2Params, Union[str, List[str]]]:
    """Prepare the generation parameters.

    `exllamav2` uses different default values

    """
    from exllamav2.generator import ExLlamaV2Sampler

    if isinstance(prompts, str):
        prompts = [prompts]
    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

    if max_tokens is None:
        max_tokens = []
        for prompt in prompts:
            ids = self.generator.tokenizer.encode(
                prompt, encode_special_tokens=True
            )
            prompt_tokens = ids.shape[-1]
            max_tokens.append(self.max_seq_len - prompt_tokens)
        exllamav2_params["max_new_tokens"] = max_tokens
    else:
        exllamav2_params["max_new_tokens"] = [
            max_tokens for _ in range(len(prompts))
        ]

    stop_conditions = [self.generator.tokenizer.eos_token_id]
    if isinstance(generation_parameters.stop_at, str):
        stop_conditions.append(generation_parameters.stop_at)
    elif isinstance(generation_parameters.stop_at, list):
        for stop_at in generation_parameters.stop_at:
            stop_conditions.append(stop_at)
    exllamav2_params["stop_conditions"] = stop_conditions
    exllamav2_params["seed"] = seed

    gen_settings = ExLlamaV2Sampler.Settings()
    if sampling_parameters.temperature is not None:
        gen_settings.temperature = sampling_parameters.temperature
    if sampling_parameters.top_p is not None:
        gen_settings.top_p = sampling_parameters.top_p
    if sampling_parameters.top_k is not None:
        gen_settings.top_k = sampling_parameters.top_k
    gen_settings.logits_processor = structure_logits_processor
    exllamav2_params["gen_settings"] = gen_settings
    if sampling_parameters.num_samples > 1:
        prompts = prompts * sampling_parameters.num_samples
        exllamav2_params["max_new_tokens"] = (
            exllamav2_params["max_new_tokens"] * sampling_parameters.num_samples
        )

    if len(prompts) == 1:
        prompts = prompts[0]

    return exllamav2_params, prompts
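
For illustration, here is a minimal sketch of how the Outlines dataclasses map onto exllamav2's parameters. It assumes a model already loaded with models.exl2 (see the loader below), and that GenerationParameters and SamplingParameters can be imported from outlines.generate.api (an assumed import path); the path, prompt and values are placeholders.

from outlines import models
from outlines.generate.api import GenerationParameters, SamplingParameters  # assumed path

model = models.exl2(model_path="./Llama-3-8B-exl2", max_seq_len=4096)  # placeholder path

# Field order follows the `dataclasses.astuple` unpacking used above:
# GenerationParameters(max_tokens, stop_at, seed)
# SamplingParameters(sampler, num_samples, top_p, top_k, temperature)
gen_params = GenerationParameters(64, "\n", 0)
samp_params = SamplingParameters("multinomial", 2, 0.9, None, 0.7)

params, prompts = model.prepare_generation_parameters(
    "Q: 2 + 2 = ", gen_params, samp_params, None
)
# prompts is duplicated once per sample: ["Q: 2 + 2 = ", "Q: 2 + 2 = "]
# params["max_new_tokens"] == [64, 64]; params["stop_conditions"] ends with "\n"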

reformat_output(output, sampling_parameters)

Reformat the output from exllamav2's format to Outlines' format.

exllamav2 works with either a list or a string (it also supports CFG sampling with tuples, but we ignore that for now). exllamav2's logic is:

  1. If the prompt is a string, return a string. This is the same as Outlines.
  2. If the prompt is a list, return a list. This differs from Outlines, which returns a string when the list contains a single element.
  3. exllamav2 has no notion of num_samples, so the prompts are duplicated num_samples times and this function groups the output into a list of lists.
Source code in outlines/models/exllamav2.py
def reformat_output(
    self, output: Union[str, List[str]], sampling_parameters: SamplingParameters
):
    """
    Reformat the output from exllamav2's format to Outlines' format.

    exllamav2 works with either a list or a string (it also supports CFG
    sampling with tuples, but we ignore that for now). exllamav2's logic is:

    1. If the prompt is a string, return a string. This is the same as Outlines.
    2. If the prompt is a list, return a list. This differs from Outlines,
       which returns a string when the list contains a single element.
    3. exllamav2 has no notion of `num_samples`, so the prompts are duplicated
       `num_samples` times and this function groups the output into a list of lists.
    """
    if isinstance(output, str):
        return output
    if len(output) == 1:
        return output[0]
    if sampling_parameters.num_samples > 1:
        if len(output) == sampling_parameters.num_samples:
            return output
        assert len(output) % sampling_parameters.num_samples == 0
        num_items_per_sample = len(output) // sampling_parameters.num_samples
        new_output = []
        for i in range(sampling_parameters.num_samples):
            curr_sample = []
            for j in range(num_items_per_sample):
                curr_sample.append(output[i * num_items_per_sample + j])
            new_output.append(curr_sample)
        return new_output
    return output
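
The grouping described above can be illustrated with placeholder strings. In this sketch, types.SimpleNamespace stands in for SamplingParameters since only num_samples is read, and the method is called unbound because it does not touch self; this is for illustration only, not part of the public API, and it assumes outlines.models.exllamav2 imports without a loaded model.

from types import SimpleNamespace

from outlines.models.exllamav2 import ExLlamaV2Model

# Two prompts, two samples: the generator returns a flat list of four
# completions (placeholder strings here), one batch of prompts per sample.
flat_output = ["p1 sample1", "p2 sample1", "p1 sample2", "p2 sample2"]
grouped = ExLlamaV2Model.reformat_output(
    None, flat_output, SimpleNamespace(num_samples=2)
)
# grouped == [["p1 sample1", "p2 sample1"], ["p1 sample2", "p2 sample2"]]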

exl2(model_path, draft_model_path=None, max_seq_len=None, cache_q4=False, paged=True, max_chunk_size=None)

Load an ExLlamaV2 model.

Parameters:

model_path (str), required
    Path to the model directory.
device, required
    Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU.
max_seq_len (Optional[int]), default None
    Maximum sequence length. Defaults to None.
scale_pos_emb, required
    Scale factor for positional embeddings. Defaults to None.
scale_alpha_value, required
    Scale alpha value. Defaults to None.
no_flash_attn, required
    Disable flash attention. Defaults to None.
num_experts_per_token, required
    Number of experts per token. Defaults to None.
cache_q4 (bool), default False
    Use Q4 cache. Defaults to False.
tokenizer_kwargs, required
    Additional keyword arguments for the tokenizer. Defaults to {}.
gpu_split, required
    "auto", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature.
low_mem, required
    Enable VRAM optimizations, potentially trading off speed.
verbose, required
    Enable if you want debugging statements.

Returns:

An `ExLlamaV2Model` instance.

Raises:

`ImportError` if the `exllamav2` library is not installed.
Source code in outlines/models/exllamav2.py
def exl2(
    model_path: str,
    draft_model_path: Optional[str] = None,
    max_seq_len: Optional[int] = None,
    cache_q4: bool = False,
    paged: bool = True,
    max_chunk_size: Optional[int] = None,
) -> ExLlamaV2Model:
    """
    Load an ExLlamaV2 model.

    Parameters
    ----------
    model_path (str)
        Path to the model directory.
    device (str)
        Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU
    max_seq_len (Optional[int], optional)
        Maximum sequence length. Defaults to None.
    scale_pos_emb (Optional[float], optional)
        Scale factor for positional embeddings. Defaults to None.
    scale_alpha_value (Optional[float], optional)
        Scale alpha value. Defaults to None.
    no_flash_attn (Optional[bool], optional)
        Disable flash attention. Defaults to None.
    num_experts_per_token (Optional[int], optional)
        Number of experts per token. Defaults to None.
    cache_q4 (bool, optional)
        Use Q4 cache. Defaults to False.
    tokenizer_kwargs (dict, optional)
        Additional keyword arguments for the tokenizer. Defaults to {}.
    gpu_split (str)
        \"auto\", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature
    low_mem (bool, optional)
        Enable VRAM optimizations, potentially trading off speed
    verbose (bool, optional)
        Enable if you want debugging statements

    Returns
    -------
    An `ExLlamaV2Model` instance.

    Raises
    ------
    `ImportError` if the `exllamav2` library is not installed.

    """
    try:
        from exllamav2 import (
            ExLlamaV2,
            ExLlamaV2Cache,
            ExLlamaV2Cache_Q4,
            ExLlamaV2Config,
            ExLlamaV2Tokenizer,
        )
        from exllamav2.generator import ExLlamaV2DynamicGenerator

    except ImportError:
        raise ImportError(
            "The `exllamav2`, `transformers` and `torch` libraries needs to be installed in order to use `exllamav2` models. "
            "Please run `pip install transformers torch git+https://github.com/lapp0/exllamav2@sampler-logits-processor` "
            "Documentation: https://dottxt-ai.github.io/outlines/latest/reference/models/exllamav2/"
        )
    config = ExLlamaV2Config(model_path)
    if max_chunk_size is not None:
        config.max_input_len = max_chunk_size
        config.max_attention_size = max_chunk_size**2

    config.arch_compat_overrides()
    model = ExLlamaV2(config)
    if max_seq_len is None:
        max_seq_len = -1
    if cache_q4:
        cache = ExLlamaV2Cache_Q4(model, max_seq_len=max_seq_len, lazy=True)
    else:
        cache = ExLlamaV2Cache(model, max_seq_len=max_seq_len, lazy=True)
    model.load_autosplit(cache, progress=True)

    print("Loading tokenizer...")
    tokenizer = ExLlamaV2Tokenizer(config)
    max_batch_size = 4 if paged else 1

    draft_model = None
    draft_cache = None
    if draft_model_path is not None:
        draft_config = ExLlamaV2Config(draft_model_path)
        draft_model = ExLlamaV2(draft_config)

        if cache_q4:
            draft_cache = ExLlamaV2Cache_Q4(
                draft_model, max_seq_len=max_seq_len, lazy=True
            )
        else:
            draft_cache = ExLlamaV2Cache(
                draft_model, max_seq_len=max_seq_len, lazy=True
            )

    # Initialize the generator with all default parameters
    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        draft_model=draft_model,
        draft_cache=draft_cache,
        tokenizer=tokenizer,
        max_batch_size=max_batch_size,
        use_ngram_draft=False,
        max_chunk_size=max_chunk_size,
        paged=paged,
    )
    max_seq_len = cache.max_seq_len

    outlines_tokenizer = OutlinesExLlamaV2Tokenizer(tokenizer)
    outlines_exl2_model = ExLlamaV2Model(generator, outlines_tokenizer, max_seq_len)
    return outlines_exl2_model
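
A minimal usage sketch, assuming a local directory containing an exl2-quantized model (the path is a placeholder) and the high-level outlines.generate API:

from outlines import generate, models

model = models.exl2(model_path="./Llama-3-8B-exl2", max_seq_len=4096)  # placeholder path
generator = generate.text(model)
print(generator("Q: What is the capital of France?\nA:", max_tokens=16, stop_at="\n"))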

llamacpp

LlamaCpp

Represents a model provided by the llama-cpp-python library.

We wrap models from model providing libraries in order to give all of them the same interface in Outlines and allow users to easily switch between providers. This class wraps the llama_cpp.Llama class from the llama-cpp-python library.

Source code in outlines/models/llamacpp.py
class LlamaCpp:
    """Represents a model provided by the `llama-cpp-python` library.

    We wrap models from model providing libraries in order to give all of
    them the same interface in Outlines and allow users to easily switch
    between providers. This class wraps the `llama_cpp.Llama` class from the
    `llama-cpp-python` library.

    """

    def __init__(self, model: "Llama"):
        self.model = model

    @property
    def tokenizer(self):
        return LlamaCppTokenizer(self.model)

    def prepare_generation_parameters(
        self,
        generation_parameters: GenerationParameters,
        sampling_parameters: SamplingParameters,
        structure_logits_processor,
        **llama_cpp_params: Unpack[LlamaCppParams],
    ):
        """Prepare the generation parameters.

        `llama-cpp-python` uses different default values

        """
        from llama_cpp import LogitsProcessorList

        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

        # We update `llama_cpp_params` with the values the user passed to the
        # generator.
        if "stop" not in llama_cpp_params:
            llama_cpp_params["stop"] = stop_at
        if "seed" not in llama_cpp_params:
            llama_cpp_params["seed"] = seed

        # Somehow `llama-cpp-python` generates `max_tokens + 1`  tokens
        if "max_tokens" not in llama_cpp_params:
            if max_tokens is None:
                llama_cpp_params["max_tokens"] = -1  # indicates unlimited tokens
            else:
                llama_cpp_params["max_tokens"] = max_tokens - 1
        else:
            llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1

        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )

        # We update the `llama_cpp_params` with the sampling values that
        # were specified by the user via the `Sampler` class, unless they
        # are also specified in `llama_cpp_params`. We also disable other
        # sampling methods that are enabled by default and reset the temperature
        # value.
        #
        # See https://github.com/ggerganov/llama.cpp/blob/e11a8999b5690f810c2c99c14347f0834e68c524/common/sampling.h#L22
        # for the default values in `llama.cpp` and indications to disable the sampling modes.
        # Mirostat sampling, tail-free sampling and all penalties are disabled by default.
        #
        # See https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
        # for default values in `llama-cpp-python`
        if sampler == "beam_search":
            raise NotImplementedError(
                "The `llama_cpp_python` library does not support Beam Search."
            )
        if num_samples != 1:
            raise NotImplementedError(
                "The `llama_cpp_python` library does not allow to take several samples."
            )
        if "top_p" not in llama_cpp_params:
            if top_p is not None:
                llama_cpp_params["top_p"] = top_p
            else:
                llama_cpp_params["top_p"] = 1.0

        if "min_p" not in llama_cpp_params:
            llama_cpp_params["min_p"] = 0.0

        if "top_k" not in llama_cpp_params:
            if top_k is not None:
                llama_cpp_params["top_k"] = top_k
            else:
                llama_cpp_params["top_k"] = -1

        if "temperature" not in llama_cpp_params:
            if temperature is not None:
                llama_cpp_params["temperature"] = temperature
            else:
                llama_cpp_params["temperature"] = 1.0

        if "repeat_penalty" not in llama_cpp_params:
            llama_cpp_params["repeat_penalty"] = 1.0

        # The choice to stream or not should happen via the high-level API
        llama_cpp_params["stream"] = False

        if structure_logits_processor is not None:
            if "logits_processor" in llama_cpp_params:
                llama_cpp_params["logits_processor"].append(structure_logits_processor)
            else:
                llama_cpp_params["logits_processor"] = LogitsProcessorList(
                    [structure_logits_processor]
                )

        return llama_cpp_params

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **llama_cpp_params: Unpack[LlamaCppParams],
    ) -> str:
        """Generate text using `llama-cpp-python`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.
        llama_cpp_params
            Keyword arguments that can be passed to
            `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
            supersede the values of the parameters in `generation_parameters` and
            `sampling_parameters`.  See the `llama_cpp_python` documentation for
            a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

        Returns
        -------
        The generated text.

        """
        if not isinstance(prompts, str):
            raise NotImplementedError(
                "The `llama-cpp-python` library does not support batch inference."
            )

        llama_cpp_params = self.prepare_generation_parameters(
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
            **llama_cpp_params,
        )
        completion = self.model(prompts, **llama_cpp_params)
        result = completion["choices"][0]["text"]

        self.model.reset()

        return result

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **llama_cpp_params: Unpack[LlamaCppParams],
    ) -> Iterator[str]:
        """Stream text using `llama-cpp-python`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.
        llama_cpp_params
            Keyword arguments that can be passed to
            `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
            supersede the values of the parameters in `generation_parameters` and
            `sampling_parameters`.  See the `llama_cpp_python` documentation for
            a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

        Returns
        -------
        A generator that returns strings.

        """

        if not isinstance(prompts, str):
            raise NotImplementedError(
                "The `llama-cpp-python` library does not support batch inference."
            )

        llama_cpp_params = self.prepare_generation_parameters(
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
            **llama_cpp_params,
        )
        llama_cpp_params["stream"] = True
        generator = self.model(prompts, **llama_cpp_params)

        def token_generator() -> Iterator[str]:
            while True:
                try:
                    result = next(generator)
                    yield result["choices"][0]["text"]
                except StopIteration:
                    self.model.reset()
                    return

        return token_generator()

    def load_lora(self, adapter_path: str):
        if self.model._model.apply_lora_from_file(
            adapter_path,
            1.0,
        ):
            raise RuntimeError(f"Failed to apply LoRA from lora path: {adapter_path}")

generate(prompts, generation_parameters, structure_logits_processor, sampling_parameters, **llama_cpp_params)

Generate text using llama-cpp-python.

Parameters:

prompts (Union[str, List[str]]), required
    A prompt or list of prompts.
generation_parameters (GenerationParameters), required
    An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's __call__ method.
logits_processor, required
    The logits processor to use when generating text.
sampling_parameters (SamplingParameters), required
    An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.
llama_cpp_params (Unpack[LlamaCppParams]), default {}
    Keyword arguments that can be passed to llama_cpp_python.Llama.__call__. The values in llama_cpp_params supersede the values of the parameters in generation_parameters and sampling_parameters. See the llama_cpp_python documentation for a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

Returns:

The generated text.
Source code in outlines/models/llamacpp.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    structure_logits_processor,
    sampling_parameters: SamplingParameters,
    **llama_cpp_params: Unpack[LlamaCppParams],
) -> str:
    """Generate text using `llama-cpp-python`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.
    llama_cpp_params
        Keyword arguments that can be passed to
        `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
        supersede the values of the parameters in `generation_parameters` and
        `sampling_parameters`.  See the `llama_cpp_python` documentation for
        a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

    Returns
    -------
    The generated text.

    """
    if not isinstance(prompts, str):
        raise NotImplementedError(
            "The `llama-cpp-python` library does not support batch inference."
        )

    llama_cpp_params = self.prepare_generation_parameters(
        generation_parameters,
        sampling_parameters,
        structure_logits_processor,
        **llama_cpp_params,
    )
    completion = self.model(prompts, **llama_cpp_params)
    result = completion["choices"][0]["text"]

    self.model.reset()

    return result
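
A sketch of calling generate directly, assuming the model was loaded with models.llamacpp (repository and file names are placeholders) and that the parameter dataclasses live in outlines.generate.api (an assumed import path). In practice this method is usually driven through the high-level outlines.generate API shown further below.

from outlines import models
from outlines.generate.api import GenerationParameters, SamplingParameters  # assumed path

model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf")  # placeholder repo/file

text = model.generate(
    "Q: Name one primary color.\nA:",
    GenerationParameters(32, None, None),            # (max_tokens, stop_at, seed)
    None,                                            # no structured-generation logits processor
    SamplingParameters("multinomial", 1, None, None, 0.8),
)
print(text)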

prepare_generation_parameters(generation_parameters, sampling_parameters, structure_logits_processor, **llama_cpp_params)

Prepare the generation parameters.

llama-cpp-python uses different default values

Source code in outlines/models/llamacpp.py
def prepare_generation_parameters(
    self,
    generation_parameters: GenerationParameters,
    sampling_parameters: SamplingParameters,
    structure_logits_processor,
    **llama_cpp_params: Unpack[LlamaCppParams],
):
    """Prepare the generation parameters.

    `llama-cpp-python` uses different default values

    """
    from llama_cpp import LogitsProcessorList

    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

    # We update `llama_cpp_params` with the values the user passed to the
    # generator.
    if "stop" not in llama_cpp_params:
        llama_cpp_params["stop"] = stop_at
    if "seed" not in llama_cpp_params:
        llama_cpp_params["seed"] = seed

    # Somehow `llama-cpp-python` generates `max_tokens + 1`  tokens
    if "max_tokens" not in llama_cpp_params:
        if max_tokens is None:
            llama_cpp_params["max_tokens"] = -1  # indicates unlimited tokens
        else:
            llama_cpp_params["max_tokens"] = max_tokens - 1
    else:
        llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1

    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )

    # We update the `llama_cpp_params` with the sampling values that
    # were specified by the user via the `Sampler` class, unless they
    # are also specified in `llama_cpp_params`. We also disable other
    # sampling methods that are enabled by default and reset the temperature
    # value.
    #
    # See https://github.com/ggerganov/llama.cpp/blob/e11a8999b5690f810c2c99c14347f0834e68c524/common/sampling.h#L22
    # for the default values in `llama.cpp` and indications to disable the sampling modes.
    # Mirostat sampling, tail-free sampling and all penalties are disabled by default.
    #
    # See https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
    # for default values in `llama-cpp-python`
    if sampler == "beam_search":
        raise NotImplementedError(
            "The `llama_cpp_python` library does not support Beam Search."
        )
    if num_samples != 1:
        raise NotImplementedError(
            "The `llama_cpp_python` library does not allow to take several samples."
        )
    if "top_p" not in llama_cpp_params:
        if top_p is not None:
            llama_cpp_params["top_p"] = top_p
        else:
            llama_cpp_params["top_p"] = 1.0

    if "min_p" not in llama_cpp_params:
        llama_cpp_params["min_p"] = 0.0

    if "top_k" not in llama_cpp_params:
        if top_k is not None:
            llama_cpp_params["top_k"] = top_k
        else:
            llama_cpp_params["top_k"] = -1

    if "temperature" not in llama_cpp_params:
        if temperature is not None:
            llama_cpp_params["temperature"] = temperature
        else:
            llama_cpp_params["temperature"] = 1.0

    if "repeat_penalty" not in llama_cpp_params:
        llama_cpp_params["repeat_penalty"] = 1.0

    # The choice to stream or not should happen via the high-level API
    llama_cpp_params["stream"] = False

    if structure_logits_processor is not None:
        if "logits_processor" in llama_cpp_params:
            llama_cpp_params["logits_processor"].append(structure_logits_processor)
        else:
            llama_cpp_params["logits_processor"] = LogitsProcessorList(
                [structure_logits_processor]
            )

    return llama_cpp_params
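
The default handling and the precedence of llama_cpp_params can be sketched as follows. The method is called unbound with None for self because it does not read instance state; this is an illustration only. The import path for the dataclasses is assumed, and llama-cpp-python must be installed for the LogitsProcessorList import.

from outlines.generate.api import GenerationParameters, SamplingParameters  # assumed path
from outlines.models.llamacpp import LlamaCpp

gen_params = GenerationParameters(32, ".", 0)                       # (max_tokens, stop_at, seed)
samp_params = SamplingParameters("multinomial", 1, 0.9, None, 0.7)

params = LlamaCpp.prepare_generation_parameters(None, gen_params, samp_params, None)
# params["max_tokens"] == 31 (llama-cpp-python generates max_tokens + 1 tokens)
# params["top_p"] == 0.9, params["top_k"] == -1, params["temperature"] == 0.7
# params["stop"] == ".", params["repeat_penalty"] == 1.0, params["stream"] is False

# Values passed as keyword arguments take precedence over the dataclasses:
params = LlamaCpp.prepare_generation_parameters(
    None, gen_params, samp_params, None, temperature=0.0
)
# params["temperature"] == 0.0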

stream(prompts, generation_parameters, structure_logits_processor, sampling_parameters, **llama_cpp_params)

Stream text using llama-cpp-python.

Parameters:

prompts (Union[str, List[str]]), required
    A prompt or list of prompts.
generation_parameters (GenerationParameters), required
    An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's __call__ method.
logits_processor, required
    The logits processor to use when generating text.
sampling_parameters (SamplingParameters), required
    An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.
llama_cpp_params (Unpack[LlamaCppParams]), default {}
    Keyword arguments that can be passed to llama_cpp_python.Llama.__call__. The values in llama_cpp_params supersede the values of the parameters in generation_parameters and sampling_parameters. See the llama_cpp_python documentation for a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

Returns:

A generator that returns strings.
Source code in outlines/models/llamacpp.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    structure_logits_processor,
    sampling_parameters: SamplingParameters,
    **llama_cpp_params: Unpack[LlamaCppParams],
) -> Iterator[str]:
    """Stream text using `llama-cpp-python`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.
    llama_cpp_params
        Keyword arguments that can be passed to
        `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
        supersede the values of the parameters in `generation_parameters` and
        `sampling_parameters`.  See the `llama_cpp_python` documentation for
        a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

    Returns
    -------
    A generator that returns strings.

    """

    if not isinstance(prompts, str):
        raise NotImplementedError(
            "The `llama-cpp-python` library does not support batch inference."
        )

    llama_cpp_params = self.prepare_generation_parameters(
        generation_parameters,
        sampling_parameters,
        structure_logits_processor,
        **llama_cpp_params,
    )
    llama_cpp_params["stream"] = True
    generator = self.model(prompts, **llama_cpp_params)

    def token_generator() -> Iterator[str]:
        while True:
            try:
                result = next(generator)
                yield result["choices"][0]["text"]
            except StopIteration:
                self.model.reset()
                return

    return token_generator()
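
Streaming is normally consumed through the high-level API; a sketch (repository and file names are placeholders):

from outlines import generate, models

model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf")  # placeholder repo/file
generator = generate.text(model)

for token in generator.stream("Write one sentence about otters.", max_tokens=40):
    print(token, end="")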

LlamaCppTokenizer

Bases: Tokenizer

Source code in outlines/models/llamacpp.py
class LlamaCppTokenizer(Tokenizer):
    def __init__(self, model: "Llama"):
        self.eos_token_id = model.token_eos()
        self.eos_token = model.tokenizer().decode([self.eos_token_id])
        self.pad_token_id = self.eos_token_id
        self.special_tokens: Set[str] = set()

        self.vocabulary: Dict[str, int] = dict()

        self.tokenizer = model.tokenizer()

        # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
        self._hf_tokenizer = None
        try:
            self.vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
            self._hf_tokenizer = model.tokenizer_.hf_tokenizer
        except AttributeError:
            # ###
            for t in range(model.n_vocab()):
                token_piece = model.tokenizer().decode([t])
                self.vocabulary[token_piece] = t

        # ensure stable ordering of vocabulary
        self.vocabulary = {
            tok: tok_id
            for tok, tok_id in sorted(self.vocabulary.items(), key=lambda x: x[1])
        }

        self._hash = None

    def decode(self, token_ids: List[int]) -> List[str]:
        decoded_bytes = self.tokenizer.detokenize(token_ids)
        return [decoded_bytes.decode("utf-8", errors="ignore")]

    def encode(
        self, prompt: Union[str, List[str]], add_bos: bool = True, special: bool = True
    ) -> Tuple[List[int], List[int]]:
        if isinstance(prompt, list):
            raise NotImplementedError(
                "llama-cpp-python tokenizer doesn't support batch tokenization"
            )
        token_ids = self.tokenizer.tokenize(
            prompt.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
        )
        # generate attention mask, missing from llama-cpp-python
        attention_mask = [
            1 if token_id != self.pad_token_id else 0 for token_id in token_ids
        ]
        return token_ids, attention_mask

    def convert_token_to_string(self, token: str) -> str:
        if self._hf_tokenizer is not None:
            from transformers.file_utils import SPIECE_UNDERLINE

            token_str = self._hf_tokenizer.convert_tokens_to_string([token])
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                token_str = " " + token_str
            return token_str
        else:
            return token

    def __eq__(self, other):
        if not isinstance(other, LlamaCppTokenizer):
            return False
        return self.__getstate__() == other.__getstate__()

    def __hash__(self):
        if self._hash is None:
            self._hash = hash(pickle.dumps(self))
        return self._hash

    def __getstate__(self):
        """Create a stable representation for outlines.caching"""
        return (
            self.vocabulary,
            self.eos_token_id,
            self.eos_token,
            self.pad_token_id,
            sorted(self.special_tokens),
        )

    def __setstate__(self, state):
        raise NotImplementedError("Cannot load a pickled llamacpp tokenizer")
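
A round-trip sketch for the tokenizer wrapper, assuming llama-cpp-python is installed; the repository and file names are placeholders for any GGUF model.

from llama_cpp import Llama

from outlines.models.llamacpp import LlamaCppTokenizer

llama_model = Llama.from_pretrained("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf")  # placeholder
tokenizer = LlamaCppTokenizer(llama_model)

token_ids, attention_mask = tokenizer.encode("Hello world")
print(tokenizer.decode(list(token_ids))[0])  # `decode` returns a single-element list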

__getstate__()

Create a stable representation for outlines.caching

Source code in outlines/models/llamacpp.py
def __getstate__(self):
    """Create a stable representation for outlines.caching"""
    return (
        self.vocabulary,
        self.eos_token_id,
        self.eos_token,
        self.pad_token_id,
        sorted(self.special_tokens),
    )

llamacpp(repo_id, filename=None, **llamacpp_model_params)

Load a model from the llama-cpp-python library.

We use the Llama.from_pretrained classmethod that downloads models directly from the HuggingFace hub, instead of asking users to specify a path to the downloaded model. One can still load a local model by initializing llama_cpp.Llama directly.

Parameters:

repo_id (str), required
    The name of the model repository.
filename (Optional[str]), default None
    A filename or glob pattern to match the model file in the repo.
llama_cpp_model_params, required
    Llama-specific model parameters. See the llama-cpp-python documentation for the full list: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__
Source code in outlines/models/llamacpp.py
def llamacpp(
    repo_id: str, filename: Optional[str] = None, **llamacpp_model_params
) -> LlamaCpp:
    """Load a model from the `llama-cpp-python` library.

    We use the `Llama.from_pretrained` classmethod that downloads models
    directly from the HuggingFace hub, instead of asking users to specify
    a path to the downloaded model. One can still load a local model
    by initializing `llama_cpp.Llama` directly.

    Parameters
    ----------
    repo_id
        The name of the model repository.
    filename
        A filename or glob pattern to match the model file in the repo.
    llama_cpp_model_params
        Llama-specific model parameters. See the `llama-cpp-python` documentation
        for the full list: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__

    """
    from llama_cpp import Llama

    # Default to using the model's full context length
    if "n_ctx" not in llamacpp_model_params:
        llamacpp_model_params["n_ctx"] = 0

    if "verbose" not in llamacpp_model_params:
        llamacpp_model_params["verbose"] = False

    # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
    if "tokenizer" not in llamacpp_model_params:
        warnings.warn(
            "The pre-tokenizer in `llama.cpp` handles unicode improperly "
            + "(https://github.com/ggerganov/llama.cpp/pull/5613)\n"
            + "Outlines may raise a `RuntimeError` when building the regex index.\n"
            + "To circumvent this error when using `models.llamacpp()` you may pass the argument"
            + "`tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(<hf_repo_id>)`\n"
        )

    model = Llama.from_pretrained(repo_id, filename, **llamacpp_model_params)

    return LlamaCpp(model)
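
A usage sketch that downloads a GGUF file from the Hugging Face Hub and generates text through the high-level API. The repository and file names are placeholders; extra keyword arguments are forwarded to llama_cpp.Llama.

from outlines import generate, models

model = models.llamacpp(
    "TheBloke/phi-2-GGUF",    # placeholder repo_id
    "phi-2.Q4_K_M.gguf",      # placeholder filename / glob pattern
    n_ctx=4096,               # forwarded to llama_cpp.Llama
)
generator = generate.text(model)
print(generator("Q: Name three primary colors.\nA:", max_tokens=32, stop_at="\n"))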

mlxlm

MLXLM

Represents an mlx_lm model

Source code in outlines/models/mlxlm.py
class MLXLM:
    """
    Represents an `mlx_lm` model
    """

    def __init__(
        self,
        model: "nn.Module",
        tokenizer: "PreTrainedTokenizer",
    ):
        self.model = model
        self.mlx_tokenizer = tokenizer  # returns mlx tensors, used for encode()
        self.tokenizer = TransformerTokenizer(
            tokenizer._tokenizer
        )  # _tokenizer is HF Tokenizer

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: "GenerationParameters",
        logits_processor,
        sampling_parameters: "SamplingParameters",
    ) -> str:
        streamer = self.stream(
            prompts, generation_parameters, logits_processor, sampling_parameters
        )
        return "".join(list(streamer))

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: "GenerationParameters",
        logits_processor,
        sampling_parameters: "SamplingParameters",
    ) -> Iterator[str]:
        """Generate text using `mlx_lm`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text.
        """
        import mlx.core as mx

        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )
        if max_tokens is None:
            max_tokens = int(1e9)

        if not isinstance(prompts, str):
            raise NotImplementedError(
                "The `mlx-lm` library does not support batch inference."
            )
        if sampler == "beam_search":
            raise NotImplementedError(
                "The `mlx-lm` library does not support Beam Search."
            )
        if num_samples != 1:
            raise NotImplementedError(
                "The `mlx-lm` library does not allow to take several samples."
            )
        if top_k is not None:
            raise NotImplementedError("The `mlx-lm` library does not support top_k.")
        if seed is not None:
            raise NotImplementedError("The `mlx-lm` library does not support seed.")
        if stop_at is not None:
            raise NotImplementedError("The `mlx-lm` library does not support stop_at.")

        generate_kwargs = {
            "temp": temperature,
            "top_p": top_p,
            "sampler": sampler,
            "logits_processor": logits_processor,
        }

        # Adapted from
        # https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L267
        prompt_tokens = mx.array(self.mlx_tokenizer.encode(prompts))

        detokenizer = self.mlx_tokenizer.detokenizer
        detokenizer.reset()

        for (token, prob), n in zip(
            self.generate_step(prompt_tokens, **generate_kwargs),
            range(max_tokens),
        ):
            if token == self.tokenizer.eos_token_id:
                break
            detokenizer.add_token(token)
            yield detokenizer.last_segment

        detokenizer.finalize()
        yield detokenizer.last_segment

    def generate_step(
        self,
        prompt: "mx.array",
        temp: Optional[float],
        top_p: Optional[float],
        sampler: str,
        logits_processor: "OutlinesLogitsProcessor",
    ) -> Generator[Tuple[int, float], None, None]:
        """
        Adapted from
        https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L129

        A generator producing token ids based on the given prompt from the model.

        Parameters
        ----------
        prompt
            The input prompt.
        temp
            The temperature for sampling, if 0 the argmax is used.
        top_p
            Nucleus sampling; higher values mean the model considers more lower-probability words.
        sampler
            The sampler string defined by SequenceGeneratorAdapter
        logits_processor
            Augment logits before sampling.
        """
        import mlx.core as mx
        import mlx_lm

        temperature: float = temp or 1.0

        def sample(logits: "mx.array") -> Tuple["mx.array", float]:
            softmax_logits = mx.softmax(logits)

            if temperature == 0.0 or sampler == "greedy":
                token = mx.argmax(logits, axis=-1)
            elif sampler == "multinomial":
                if top_p is not None and top_p > 0 and top_p < 1.0:
                    token = mlx_lm.sample_utils.top_p_sampling(
                        logits, top_p, temperature
                    )
                else:
                    token = mx.random.categorical(logits * (1 / temperature))
            else:
                raise ValueError(f"Invalid mlx-lm sampler: `{sampler}`")

            prob = softmax_logits[0, token]
            return token, prob

        cache = mlx_lm.models.cache.make_prompt_cache(self.model)

        # kv cache contains processed input IDs, we pass the unprocessed inputs and cache to model()
        unprocessed_input_ids = prompt
        generated_ids: List[int] = []

        while True:
            logits = self.model(unprocessed_input_ids[None], cache=cache)
            logits = logits[:, -1, :]

            if logits_processor is not None:
                # convert to logits_processor 1d expectation, apply, then convert back
                logits_1d = logits.reshape(-1)
                logits_1d = logits_processor(generated_ids, logits_1d)
                logits = logits_1d.reshape(1, -1)

            new_token_single, prob = sample(logits)
            new_token = new_token_single.item()
            yield new_token, prob

            generated_ids.append(new_token)
            unprocessed_input_ids = new_token_single

generate_step(prompt, temp, top_p, sampler, logits_processor)

Adapted from https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L129

A generator producing token ids based on the given prompt from the model.

Parameters:

prompt (array), required
    The input prompt.
temp (Optional[float]), required
    The temperature for sampling; if 0, the argmax is used.
top_p (Optional[float]), required
    Nucleus sampling; higher values mean the model considers more lower-probability words.
sampler (str), required
    The sampler string defined by SequenceGeneratorAdapter.
logits_processor (OutlinesLogitsProcessor), required
    Augment logits before sampling.
Source code in outlines/models/mlxlm.py
def generate_step(
    self,
    prompt: "mx.array",
    temp: Optional[float],
    top_p: Optional[float],
    sampler: str,
    logits_processor: "OutlinesLogitsProcessor",
) -> Generator[Tuple[int, float], None, None]:
    """
    Adapted from
    https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L129

    A generator producing token ids based on the given prompt from the model.

    Parameters
    ----------
    prompt
        The input prompt.
    temp
        The temperature for sampling, if 0 the argmax is used.
    top_p
        Nucleus sampling; higher values mean the model considers more lower-probability words.
    sampler
        The sampler string defined by SequenceGeneratorAdapter
    logits_processor
        Augment logits before sampling.
    """
    import mlx.core as mx
    import mlx_lm

    temperature: float = temp or 1.0

    def sample(logits: "mx.array") -> Tuple["mx.array", float]:
        softmax_logits = mx.softmax(logits)

        if temperature == 0.0 or sampler == "greedy":
            token = mx.argmax(logits, axis=-1)
        elif sampler == "multinomial":
            if top_p is not None and top_p > 0 and top_p < 1.0:
                token = mlx_lm.sample_utils.top_p_sampling(
                    logits, top_p, temperature
                )
            else:
                token = mx.random.categorical(logits * (1 / temperature))
        else:
            raise ValueError(f"Invalid mlx-lm sampler: `{sampler}`")

        prob = softmax_logits[0, token]
        return token, prob

    cache = mlx_lm.models.cache.make_prompt_cache(self.model)

    # kv cache contains processed input IDs, we pass the unprocessed inputs and cache to model()
    unprocessed_input_ids = prompt
    generated_ids: List[int] = []

    while True:
        logits = self.model(unprocessed_input_ids[None], cache=cache)
        logits = logits[:, -1, :]

        if logits_processor is not None:
            # convert to logits_processor 1d expectation, apply, then convert back
            logits_1d = logits.reshape(-1)
            logits_1d = logits_processor(generated_ids, logits_1d)
            logits = logits_1d.reshape(1, -1)

        new_token_single, prob = sample(logits)
        new_token = new_token_single.item()
        yield new_token, prob

        generated_ids.append(new_token)
        unprocessed_input_ids = new_token_single

stream(prompts, generation_parameters, logits_processor, sampling_parameters)

Generate text using mlx_lm.

Parameters:

prompts (Union[str, List[str]]), required
    A prompt or list of prompts.
generation_parameters (GenerationParameters), required
    An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's __call__ method.
logits_processor, required
    The logits processor to use when generating text.
sampling_parameters (SamplingParameters), required
    An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

Returns:

The generated text.
Source code in outlines/models/mlxlm.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: "GenerationParameters",
    logits_processor,
    sampling_parameters: "SamplingParameters",
) -> Iterator[str]:
    """Generate text using `mlx_lm`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text.
    """
    import mlx.core as mx

    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )
    if max_tokens is None:
        max_tokens = int(1e9)

    if not isinstance(prompts, str):
        raise NotImplementedError(
            "The `mlx-lm` library does not support batch inference."
        )
    if sampler == "beam_search":
        raise NotImplementedError(
            "The `mlx-lm` library does not support Beam Search."
        )
    if num_samples != 1:
        raise NotImplementedError(
            "The `mlx-lm` library does not allow to take several samples."
        )
    if top_k is not None:
        raise NotImplementedError("The `mlx-lm` library does not support top_k.")
    if seed is not None:
        raise NotImplementedError("The `mlx-lm` library does not support seed.")
    if stop_at is not None:
        raise NotImplementedError("The `mlx-lm` library does not support stop_at.")

    generate_kwargs = {
        "temp": temperature,
        "top_p": top_p,
        "sampler": sampler,
        "logits_processor": logits_processor,
    }

    # Adapted from
    # https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L267
    prompt_tokens = mx.array(self.mlx_tokenizer.encode(prompts))

    detokenizer = self.mlx_tokenizer.detokenizer
    detokenizer.reset()

    for (token, prob), n in zip(
        self.generate_step(prompt_tokens, **generate_kwargs),
        range(max_tokens),
    ):
        if token == self.tokenizer.eos_token_id:
            break
        detokenizer.add_token(token)
        yield detokenizer.last_segment

    detokenizer.finalize()
    yield detokenizer.last_segment
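
In practice this method is reached through the high-level generator API rather than called directly. A minimal streaming sketch, assuming an Apple Silicon machine with mlx_lm installed; the quantized repository name below is only illustrative:

import outlines

# Illustrative checkpoint; any mlx-community quantized model should work.
model = outlines.models.mlxlm("mlx-community/Mistral-7B-Instruct-v0.2-4bit")
generator = outlines.generate.text(model)

# `stream` yields decoded text segments as soon as they are produced.
for segment in generator.stream("Write a haiku about autumn:", max_tokens=64):
    print(segment, end="", flush=True)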

mlxlm(model_name, tokenizer_config={}, model_config={}, adapter_path=None, lazy=False)

Instantiate a model from the mlx_lm library and its tokenizer.

Signature adapted from https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L422

Parameters:

Name Type Description Default
model_name str

The path or the Hugging Face repository to load the model from.

required
tokenizer_config dict

Configuration parameters specifically for the tokenizer. Defaults to an empty dictionary.

{}
model_config dict

Configuration parameters specifically for the model. Defaults to an empty dictionary.

{}
adapter_path Optional[str]

Path to the LoRA adapters. If provided, applies LoRA layers to the model.

None
lazy bool

If False, eval the model parameters to make sure they are loaded in memory before returning, otherwise they will be loaded when needed.

False

Returns:

Type Description
A `MLXLM` model instance.
Source code in outlines/models/mlxlm.py
def mlxlm(
    model_name: str,
    tokenizer_config: dict = {},
    model_config: dict = {},
    adapter_path: Optional[str] = None,
    lazy: bool = False,
):
    """Instantiate a model from the `mlx_lm` library and its tokenizer.

    Signature adapted from
    https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L422

    Parameters
    ----------
    model_name
        The path or the Hugging Face repository to load the model from.
    tokenizer_config
        Configuration parameters specifically for the tokenizer. Defaults to
        an empty dictionary.
    model_config
        Configuration parameters specifically for the model. Defaults to an
        empty dictionary.
    adapter_path
        Path to the LoRA adapters. If provided, applies LoRA layers to the
        model. Defaults to `None`.
    lazy
        If `False`, eval the model parameters to make sure they are loaded in
        memory before returning, otherwise they will be loaded when needed.
        Defaults to `False`.

    Returns
    -------
    A `MLXLM` model instance.

    """
    try:
        import mlx.core as mx
        import mlx_lm
    except ImportError:
        raise ImportError(
            "The `mlx_lm` library needs to be installed in order to use `mlx_lm` models."
        )
    if not mx.metal.is_available():
        raise RuntimeError("You cannot use `mlx_lm` without Apple Silicon (Metal)")

    model, tokenizer = mlx_lm.load(
        model_name,
        tokenizer_config=tokenizer_config,
        model_config=model_config,
        adapter_path=adapter_path,
        lazy=lazy,
    )
    return MLXLM(model, tokenizer)
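
A hedged usage sketch of the loader itself; the repository name and adapter path below are illustrative, and the keyword arguments are simply forwarded to mlx_lm.load as shown above:

from outlines.models import mlxlm

# `tokenizer_config` and `model_config` are passed through to `mlx_lm.load`;
# `adapter_path` applies LoRA layers on top of the base weights.
model = mlxlm(
    "mlx-community/Mistral-7B-Instruct-v0.2-4bit",  # illustrative checkpoint
    tokenizer_config={"trust_remote_code": True},
    adapter_path="./adapters",  # hypothetical local path
    lazy=False,
)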

openai

Integration with OpenAI's API.

OpenAI

An object that represents the OpenAI API.

Source code in outlines/models/openai.py
class OpenAI:
    """An object that represents the OpenAI API."""

    def __init__(
        self,
        client,
        config,
        system_prompt: Optional[str] = None,
    ):
        """Create an `OpenAI` instance.

        This class supports the standard OpenAI API, the Azure OpenAI API as
        well as compatible APIs that rely on the OpenAI client.

        Parameters
        ----------
        client
            An instance of the API's async client.
        config
            An instance of `OpenAIConfig`. Can be useful to specify some
            parameters that cannot be set by calling this class' methods.
        """

        self.client = client
        self.config = config

        # We count the total number of prompt and generated tokens as returned
        # by the OpenAI API, summed over all the requests performed with this
        # model instance.
        self.prompt_tokens = 0
        self.completion_tokens = 0

        self.format_sequence = lambda seq: seq

    def __call__(
        self,
        prompt: Union[str, List[str]],
        max_tokens: Optional[int] = None,
        stop_at: Optional[Union[List[str], str]] = None,
        *,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        samples: Optional[int] = None,
    ) -> np.ndarray:
        """Call the OpenAI API to generate text.

        Parameters
        ----------
        prompt
            A string or list of strings that will be used to prompt the model
        max_tokens
            The maximum number of tokens to generate
        stop_at
            A string or list of strings at which the API will stop the
            completion (up to 4 stop sequences).
        system_prompt
            The content of the system message that precedes the user's prompt.
        temperature
            The value of the temperature used to sample tokens
        samples
            The number of completions to generate for each prompt

        """
        if max_tokens is None:
            max_tokens = self.config.max_tokens
        if stop_at is None:
            stop_at = self.config.stop
        if temperature is None:
            temperature = self.config.temperature
        if samples is None:
            samples = self.config.n

        config = replace(
            self.config,
            max_tokens=max_tokens,
            temperature=temperature,
            n=samples,
            stop=stop_at,
        )  # type: ignore

        response, prompt_tokens, completion_tokens = generate_chat(
            prompt, system_prompt, self.client, config
        )
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens

        return self.format_sequence(response)

    def stream(self, *args, **kwargs):
        raise NotImplementedError(
            "Streaming is currently not supported for the OpenAI API"
        )

    def new_with_replacements(self, **kwargs):
        new_instance = copy.copy(self)
        new_instance.config = replace(new_instance.config, **kwargs)
        return new_instance

    def __str__(self):
        return self.__class__.__name__ + " API"

    def __repr__(self):
        return str(self.config)
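
A minimal sketch of assembling the wrapper from the pieces above, assuming the openai package is installed, an OPENAI_API_KEY environment variable is set, and that the model name below is available to your account (it is only illustrative):

from openai import AsyncOpenAI
from outlines.models.openai import OpenAI, OpenAIConfig

config = OpenAIConfig(model="gpt-4o-mini")  # model name is illustrative
client = AsyncOpenAI()                      # reads OPENAI_API_KEY from the environment
model = OpenAI(client, config)

# The instance is callable; per-call arguments override the config defaults.
answer = model("What is the capital of France?", max_tokens=16, temperature=0.0)
print(answer)

# Token usage is accumulated across all calls made with this instance.
print(model.prompt_tokens, model.completion_tokens)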

__call__(prompt, max_tokens=None, stop_at=None, *, system_prompt=None, temperature=None, samples=None)

Call the OpenAI API to generate text.

Parameters:

Name Type Description Default
prompt Union[str, List[str]]

A string or list of strings that will be used to prompt the model

required
max_tokens Optional[int]

The maximum number of tokens to generate

None
stop_at Optional[Union[List[str], str]]

A string or list of strings at which the API will stop the completion (up to 4 stop sequences).

None
system_prompt Optional[str]

The content of the system message that precedes the user's prompt.

None
temperature Optional[float]

The value of the temperature used to sample tokens

None
samples Optional[int]

The number of completions to generate for each prompt

None
Source code in outlines/models/openai.py
def __call__(
    self,
    prompt: Union[str, List[str]],
    max_tokens: Optional[int] = None,
    stop_at: Optional[Union[List[str], str]] = None,
    *,
    system_prompt: Optional[str] = None,
    temperature: Optional[float] = None,
    samples: Optional[int] = None,
) -> np.ndarray:
    """Call the OpenAI API to generate text.

    Parameters
    ----------
    prompt
        A string or list of strings that will be used to prompt the model
    max_tokens
        The maximum number of tokens to generate
    stop_at
        A string or list of strings at which the API will stop the
        completion (up to 4 stop sequences).
    system_prompt
        The content of the system message that precedes the user's prompt.
    temperature
        The value of the temperature used to sample tokens
    samples
        The number of completions to generate for each prompt

    """
    if max_tokens is None:
        max_tokens = self.config.max_tokens
    if stop_at is None:
        stop_at = self.config.stop
    if temperature is None:
        temperature = self.config.temperature
    if samples is None:
        samples = self.config.n

    config = replace(
        self.config,
        max_tokens=max_tokens,
        temperature=temperature,
        n=samples,
        stop=stop_at,
    )  # type: ignore

    response, prompt_tokens, completion_tokens = generate_chat(
        prompt, system_prompt, self.client, config
    )
    self.prompt_tokens += prompt_tokens
    self.completion_tokens += completion_tokens

    return self.format_sequence(response)

__init__(client, config, system_prompt=None)

Create an OpenAI instance.

This class supports the standard OpenAI API, the Azure OpenAI API, as well as compatible APIs that rely on the OpenAI client.

Parameters:

Name Type Description Default
client

An instance of the API's async client.

required
config

An instance of OpenAIConfig. Can be useful to specify some parameters that cannot be set by calling this class' methods.

required
Source code in outlines/models/openai.py
def __init__(
    self,
    client,
    config,
    system_prompt: Optional[str] = None,
):
    """Create an `OpenAI` instance.

    This class supports the standard OpenAI API, the Azure OpenAI API as
    well as compatible APIs that rely on the OpenAI client.

    Parameters
    ----------
    client
        An instance of the API's async client.
    config
        An instance of `OpenAIConfig`. Can be useful to specify some
        parameters that cannot be set by calling this class' methods.
    """

    self.client = client
    self.config = config

    # We count the total number of prompt and generated tokens as returned
    # by the OpenAI API, summed over all the requests performed with this
    # model instance.
    self.prompt_tokens = 0
    self.completion_tokens = 0

    self.format_sequence = lambda seq: seq

OpenAIConfig dataclass

Represents the parameters of the OpenAI API.

The information was last fetched on 2023/11/20. We document below the properties that are specific to the OpenAI API. Not all these properties are supported by Outlines.

Parameters:

Name Type Description Default
model str

The name of the model. Available models can be found on OpenAI's website.

''
frequency_penalty float

Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text.

0
logit_bias Dict[int, int]

Modifies the likelihood of specified tokens to appear in the completion. Number between -100 (forbid) and +100 (only allows).

dict()
n int

The number of completions to return for each prompt.

1
presence_penalty float

Similar to frequency penalty.

0
response_format Optional[Dict[str, str]]

Specifies the format the model must output. {"type": "json_object"} enables JSON mode.

None
seed Optional[int]

Two completions with the same seed value should return the same completion. This is however not guaranteed.

None
stop Optional[Union[str, List[str]]]

Up to 4 words where the API will stop the completion.

None
temperature float

Number between 0 and 2. Higher values make the output more random, while lower values make it more deterministic.

1.0
top_p int

Number between 0 and 1. Parameter for nucleus sampling.

1
user str

A unique identifier for the end-user.

str()
Source code in outlines/models/openai.py
@dataclass(frozen=True)
class OpenAIConfig:
    """Represents the parameters of the OpenAI API.

    The information was last fetched on 2023/11/20. We document below the
    properties that are specific to the OpenAI API. Not all these properties are
    supported by Outlines.

    Parameters
    ----------
    model
        The name of the model. Available models can be found on OpenAI's website.
    frequency_penalty
        Number between -2.0 and 2.0. Positive values penalize new tokens based on
        their existing frequency in the text.
    logit_bias
        Modifies the likelihood of specified tokens to appear in the completion.
        Number between -100 (forbid) and +100 (only allows).
    n
        The number of completions to return for each prompt.
    presence_penalty
        Similar to frequency penalty.
    response_format
        Specifies the format the model must output. `{"type": "json_object"}`
        enables JSON mode.
    seed
        Two completions with the same `seed` value should return the same
        completion. This is however not guaranteed.
    stop
        Up to 4 words where the API will stop the completion.
    temperature
        Number between 0 and 2. Higher values make the output more random, while
        lower values make it more deterministic.
    top_p
        Number between 0 and 1. Parameter for nucleus sampling.
    user
        A unique identifier for the end-user.
    """

    model: str = ""
    frequency_penalty: float = 0
    logit_bias: Dict[int, int] = field(default_factory=dict)
    max_tokens: Optional[int] = None
    n: int = 1
    presence_penalty: float = 0
    response_format: Optional[Dict[str, str]] = None
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    temperature: float = 1.0
    top_p: int = 1
    user: str = field(default_factory=str)
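
Because the dataclass is frozen, call-time overrides are built with dataclasses.replace, which is what __call__ and new_with_replacements do internally. A small sketch, with an illustrative model name:

from dataclasses import replace

from outlines.models.openai import OpenAIConfig

# The config is frozen, so variants are derived with `replace`
# instead of mutating an existing instance.
base = OpenAIConfig(model="gpt-4o-mini", temperature=0.7, max_tokens=256)
json_mode = replace(base, response_format={"type": "json_object"}, seed=0)

print(json_mode.temperature)      # 0.7, inherited from `base`
print(json_mode.response_format)  # {'type': 'json_object'}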

error_handler(api_call_fn)

Handle OpenAI API errors and missing API key.

Source code in outlines/models/openai.py
def error_handler(api_call_fn: Callable) -> Callable:
    """Handle OpenAI API errors and missing API key."""

    def call(*args, **kwargs):
        import openai

        try:
            return api_call_fn(*args, **kwargs)
        except (
            openai.APITimeoutError,
            openai.InternalServerError,
            openai.RateLimitError,
        ) as e:
            raise OSError(f"Could not connect to the OpenAI API: {e}")
        except (
            openai.AuthenticationError,
            openai.BadRequestError,
            openai.ConflictError,
            openai.PermissionDeniedError,
            openai.NotFoundError,
            openai.UnprocessableEntityError,
        ) as e:
            raise e

    return call
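
The decorator only remaps connectivity-style failures to OSError and re-raises request errors unchanged, so it can wrap any callable that touches the OpenAI client. A hedged sketch of reusing it, assuming the openai package is installed and the API key is read from the environment:

from openai import OpenAI as OpenAIClient

from outlines.models.openai import error_handler

client = OpenAIClient()  # synchronous client, reads OPENAI_API_KEY from the environment

@error_handler
def list_models():
    # Timeouts, server errors and rate limits are surfaced as OSError by the
    # decorator; authentication and request errors are re-raised unchanged.
    return [model.id for model in client.models.list()]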

generate_chat(prompt, system_prompt, client, config) async

Call OpenAI's Chat Completion API.

Parameters:

Name Type Description Default
prompt str

The prompt we use to start the generation. Passed to the model with the "user" role.

required
system_prompt Union[str, None]

The system prompt, passed to the model with the "system" role before the prompt.

required
client

The API client

required
config OpenAIConfig

An OpenAIConfig instance.

required

Returns:

Type Description
A tuple that contains the model's response(s) and usage statistics.
Source code in outlines/models/openai.py
@functools.partial(vectorize, signature="(),(),(),()->(s),(),()")
async def generate_chat(
    prompt: str,
    system_prompt: Union[str, None],
    client,
    config: OpenAIConfig,
) -> Tuple[np.ndarray, int, int]:
    """Call OpenAI's Chat Completion API.

    Parameters
    ----------
    prompt
        The prompt we use to start the generation. Passed to the model
        with the "user" role.
    system_prompt
        The system prompt, passed to the model with the "system" role
        before the prompt.
    client
        The API client
    config
        An `OpenAIConfig` instance.

    Returns
    -------
    A tuple that contains the model's response(s) and usage statistics.

    """

    @error_handler
    @cache()
    async def call_api(prompt, system_prompt, config):
        responses = await client.chat.completions.create(
            messages=system_message + user_message,
            **asdict(config),  # type: ignore
        )
        return responses.model_dump()

    system_message = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    user_message = [{"role": "user", "content": prompt}]

    responses = await call_api(prompt, system_prompt, config)

    results = np.array(
        [responses["choices"][i]["message"]["content"] for i in range(config.n)]
    )
    usage = responses["usage"]

    return results, usage["prompt_tokens"], usage["completion_tokens"]

tokenizer

Tokenizer

Bases: Hashable, Protocol

Source code in outlines/models/tokenizer.py
class Tokenizer(Hashable, Protocol):
    eos_token: str
    eos_token_id: int
    pad_token_id: int
    vocabulary: Dict[str, int]
    special_tokens: Set[str]

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
        """Translate the input prompts into arrays of token ids and attention mask."""
        ...

    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
        """Translate an array of token ids to a string or list of strings."""
        ...

    def convert_token_to_string(self, token: str) -> str:
        """Convert a token to its equivalent string.

        This is for instance useful for BPE tokenizers where whitespaces are
        represented by the special character `Ġ`. This prevents matching a raw
        token that includes `Ġ` with a string.
        """
        ...

convert_token_to_string(token)

Convert a token to its equivalent string.

This is for instance useful for BPE tokenizers where whitespaces are represented by the special character Ġ. This prevents matching a raw token that includes Ġ with a string.

Source code in outlines/models/tokenizer.py
def convert_token_to_string(self, token: str) -> str:
    """Convert a token to its equivalent string.

    This is for instance useful for BPE tokenizers where whitespaces are
    represented by the special character `Ġ`. This prevents matching a raw
    token that includes `Ġ` with a string.
    """
    ...

decode(token_ids)

Translate an array of token ids to a string or list of strings.

Source code in outlines/models/tokenizer.py
def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
    """Translate an array of token ids to a string or list of strings."""
    ...

encode(prompt)

Translate the input prompts into arrays of token ids and attention mask.

Source code in outlines/models/tokenizer.py
def encode(
    self, prompt: Union[str, List[str]]
) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
    """Translate the input prompts into arrays of token ids and attention mask."""
    ...
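
Since Tokenizer is a Protocol, any class that exposes these attributes and methods can be plugged in; no registration is required. A toy, character-level sketch (purely illustrative, not part of the library):

from typing import Dict, List, Set, Tuple, Union

import numpy as np
from numpy.typing import NDArray


class CharTokenizer:
    """Toy character-level tokenizer that satisfies the `Tokenizer` protocol."""

    def __init__(self, alphabet: str = "abc "):
        self.eos_token = "<eos>"
        self.eos_token_id = 0
        self.pad_token_id = 0
        self.vocabulary: Dict[str, int] = {self.eos_token: 0}
        self.vocabulary.update({c: i + 1 for i, c in enumerate(alphabet)})
        self.special_tokens: Set[str] = {self.eos_token}
        self._inverse = {i: t for t, i in self.vocabulary.items()}

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
        # For simplicity this toy assumes prompts of equal length when batched.
        prompts = [prompt] if isinstance(prompt, str) else prompt
        token_ids = np.array(
            [[self.vocabulary[c] for c in p] for p in prompts], dtype=np.int64
        )
        return token_ids, np.ones_like(token_ids)

    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
        return ["".join(self._inverse[int(i)] for i in row) for row in token_ids]

    def convert_token_to_string(self, token: str) -> str:
        return token

    def __hash__(self):
        return hash(tuple(sorted(self.vocabulary.items())))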

transformers

TransformerTokenizer

Bases: Tokenizer

Represents a tokenizer for models in the transformers library.

Source code in outlines/models/transformers.py
class TransformerTokenizer(Tokenizer):
    """Represents a tokenizer for models in the `transformers` library."""

    def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token

        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple["torch.LongTensor", "torch.LongTensor"]:
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: "torch.LongTensor") -> List[str]:
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        if self.is_llama:
            # A hack to handle missing spaces to HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

        return string

    def __eq__(self, other):
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __hash__(self):
        from datasets.fingerprint import Hasher

        return hash(Hasher.hash(self.tokenizer))

    def __getstate__(self):
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        self.__init__(state["tokenizer"])
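
A short sketch of wrapping a Hugging Face tokenizer directly, assuming transformers is installed; the gpt2 checkpoint is only illustrative. Note how the pad token falls back to the EOS token when the underlying tokenizer does not define one:

from transformers import AutoTokenizer

from outlines.models.transformers import TransformerTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
tokenizer = TransformerTokenizer(hf_tokenizer)

# Prompts are padded to a common length; GPT-2 has no pad token, so EOS is used.
input_ids, attention_mask = tokenizer.encode(["Hello world", "Hi"])
print(input_ids.shape, attention_mask.shape)
print(tokenizer.decode(input_ids))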

Transformers

Represents a transformers model.

Source code in outlines/models/transformers.py
class Transformers:
    """Represents a `transformers` model."""

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
    ):
        self.model = model
        self.tokenizer = TransformerTokenizer(tokenizer)

    def forward(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
        """Compute a forward pass through the transformer model.

        Parameters
        ----------
        input_ids
            The input token ids.  Must be one or two dimensional.
        attention_mask
            The attention mask.  Must be one or two dimensional.
        past_key_values
            A tuple of tuples containing the cached key and value tensors for each
            attention head.

        Returns
        -------
        The computed logits and the new cached key and value tensors.

        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "The `torch` library needs to be installed to use `transformers` models."
            )
        assert 0 < input_ids.ndim < 3

        if past_key_values:
            input_ids = input_ids[..., -1].unsqueeze(-1)

        with torch.inference_mode():
            output = self.model(
                input_ids,
                attention_mask=attention_mask,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
                past_key_values=past_key_values,
            )

        return output.logits, output.past_key_values

    def __call__(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> "torch.FloatTensor":
        logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values)
        next_token_logits = logits[..., -1, :]

        return next_token_logits, kv_cache

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using `transformers`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text
        """
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)

        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        if (
            "attention_mask"
            not in inspect.signature(self.model.forward).parameters.keys()
        ):
            del inputs["attention_mask"]

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Iterator[Union[str, List[str]]]:
        """
        Temporary stand-in which implements the stream() signature and
        equivalent behaviour, but does not yield anything until generation completes.

        TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
        """
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)
        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        if (
            "attention_mask"
            not in inspect.signature(self.model.forward).parameters.keys()
        ):
            del inputs["attention_mask"]

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        for i in range(generated_ids.size(-1)):
            output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
            yield self._decode_generation(output_group_ids)

    def _get_generation_kwargs(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> dict:
        """
        Convert outlines generation parameters into model.generate kwargs
        """
        from transformers import GenerationConfig, LogitsProcessorList, set_seed

        max_new_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )
        if max_new_tokens is None:
            max_new_tokens = int(2**30)

        # global seed, not desirable
        if seed is not None:
            set_seed(seed)

        if logits_processor is not None:
            logits_processor_list = LogitsProcessorList([logits_processor])
        else:
            logits_processor_list = None

        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            stop_strings=stop_at,
            num_return_sequences=(num_samples or 1),
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            do_sample=(sampler == "multinomial"),
            num_beams=(num_samples if sampler == "beam_search" else 1),
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        return dict(
            logits_processor=logits_processor_list,
            generation_config=generation_config,
            tokenizer=self.tokenizer.tokenizer,
        )

    def _generate_output_seq(
        self, prompts, inputs, generation_config, **generation_kwargs
    ):
        input_ids = inputs["input_ids"]
        output_ids = self.model.generate(
            **inputs, generation_config=generation_config, **generation_kwargs
        )

        # encoder-decoder returns output_ids only, decoder-only returns full seq ids
        if self.model.config.is_encoder_decoder:
            generated_ids = output_ids
        else:
            generated_ids = output_ids[:, input_ids.shape[1] :]

        # if batch list inputs AND multiple samples per input, convert generated_id to 3D view
        num_samples = generation_config.num_return_sequences or 1

        if num_samples > 1 and isinstance(prompts, list):
            batch_size = input_ids.size(0)
            num_return_sequences = generation_config.num_return_sequences or 1
            generated_ids = generated_ids.view(batch_size, num_return_sequences, -1)

        return generated_ids

    def _decode_generation(self, generated_ids: "torch.Tensor"):
        if len(generated_ids.shape) == 1:
            return self.tokenizer.decode([generated_ids])[0]
        elif len(generated_ids.shape) == 2:
            return self.tokenizer.decode(generated_ids)
        elif len(generated_ids.shape) == 3:
            return [
                self.tokenizer.decode(generated_ids[i])
                for i in range(len(generated_ids))
            ]
        else:
            raise TypeError(
                f"Generated outputs aren't 1D, 2D or 3D, but instead are {generated_ids.shape}"
            )
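
The forward/__call__ pair exposes a single decoding step, which is what the samplers build on. A hedged low-level sketch of two greedy steps with KV-cache reuse, assuming the gpt2 checkpoint (illustrative) and a CPU-only setup:

import torch

from outlines.models.transformers import transformers

model = transformers("gpt2")  # illustrative checkpoint, loaded on CPU by default

input_ids, attention_mask = model.tokenizer.encode("The capital of France is")
next_token_logits, kv_cache = model(input_ids, attention_mask)

# Greedy pick of the next token, then one more step reusing the KV cache.
next_token = next_token_logits.argmax(dim=-1, keepdim=True)
input_ids = torch.cat([input_ids, next_token], dim=-1)
attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
next_token_logits, kv_cache = model(input_ids, attention_mask, kv_cache)

print(model.tokenizer.decode(input_ids))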

_get_generation_kwargs(prompts, generation_parameters, logits_processor, sampling_parameters)

Convert outlines generation parameters into model.generate kwargs

Source code in outlines/models/transformers.py
def _get_generation_kwargs(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> dict:
    """
    Convert outlines generation parameters into model.generate kwargs
    """
    from transformers import GenerationConfig, LogitsProcessorList, set_seed

    max_new_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )
    if max_new_tokens is None:
        max_new_tokens = int(2**30)

    # global seed, not desirable
    if seed is not None:
        set_seed(seed)

    if logits_processor is not None:
        logits_processor_list = LogitsProcessorList([logits_processor])
    else:
        logits_processor_list = None

    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        stop_strings=stop_at,
        num_return_sequences=(num_samples or 1),
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        do_sample=(sampler == "multinomial"),
        num_beams=(num_samples if sampler == "beam_search" else 1),
        eos_token_id=self.tokenizer.eos_token_id,
        pad_token_id=self.tokenizer.pad_token_id,
    )

    return dict(
        logits_processor=logits_processor_list,
        generation_config=generation_config,
        tokenizer=self.tokenizer.tokenizer,
    )

forward(input_ids, attention_mask, past_key_values=None)

Compute a forward pass through the transformer model.

Parameters:

Name Type Description Default
input_ids LongTensor

The input token ids. Must be one or two dimensional.

required
attention_mask LongTensor

The attention mask. Must be one or two dimensional.

required
past_key_values Optional[Tuple]

A tuple of tuples containing the cached key and value tensors for each attention head.

None

Returns:

Type Description
The computed logits and the new cached key and value tensors.
Source code in outlines/models/transformers.py
def forward(
    self,
    input_ids: "torch.LongTensor",
    attention_mask: "torch.LongTensor",
    past_key_values: Optional[Tuple] = None,
) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
    """Compute a forward pass through the transformer model.

    Parameters
    ----------
    input_ids
        The input token ids.  Must be one or two dimensional.
    attention_mask
        The attention mask.  Must be one or two dimensional.
    past_key_values
        A tuple of tuples containing the cached key and value tensors for each
        attention head.

    Returns
    -------
    The computed logits and the new cached key and value tensors.

    """
    try:
        import torch
    except ImportError:
        raise ImportError(
            "The `torch` library needs to be installed to use `transformers` models."
        )
    assert 0 < input_ids.ndim < 3

    if past_key_values:
        input_ids = input_ids[..., -1].unsqueeze(-1)

    with torch.inference_mode():
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
            past_key_values=past_key_values,
        )

    return output.logits, output.past_key_values

generate(prompts, generation_parameters, logits_processor, sampling_parameters)

Generate text using transformers.

Parameters:

Name Type Description Default
prompts Union[str, List[str]]

A prompt or list of prompts.

required
generation_parameters GenerationParameters

An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's __call__ method.

required
logits_processor Optional[OutlinesLogitsProcessor]

The logits processor to use when generating text.

required
sampling_parameters SamplingParameters

An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

required

Returns:

Type Description
The generated text
Source code in outlines/models/transformers.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using `transformers`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text
    """
    if isinstance(prompts, str):
        # convert to 2d
        input_ids, attention_mask = self.tokenizer.encode([prompts])
    else:
        input_ids, attention_mask = self.tokenizer.encode(prompts)

    inputs = {
        "input_ids": input_ids.to(self.model.device),
        "attention_mask": attention_mask.to(self.model.device),
    }
    if (
        "attention_mask"
        not in inspect.signature(self.model.forward).parameters.keys()
    ):
        del inputs["attention_mask"]

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        generated_ids = generated_ids.squeeze(0)

    return self._decode_generation(generated_ids)

stream(prompts, generation_parameters, logits_processor, sampling_parameters)

Temporary stand-in which implements the stream() signature and equivalent behaviour, but does not yield anything until generation completes.

TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

Source code in outlines/models/transformers.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Iterator[Union[str, List[str]]]:
    """
    Temporary stand-in which implements the stream() signature and
    equivalent behaviour, but does not yield anything until generation completes.

    TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
    """
    if isinstance(prompts, str):
        # convert to 2d
        input_ids, attention_mask = self.tokenizer.encode([prompts])
    else:
        input_ids, attention_mask = self.tokenizer.encode(prompts)
    inputs = {
        "input_ids": input_ids.to(self.model.device),
        "attention_mask": attention_mask.to(self.model.device),
    }
    if (
        "attention_mask"
        not in inspect.signature(self.model.forward).parameters.keys()
    ):
        del inputs["attention_mask"]

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        generated_ids = generated_ids.squeeze(0)

    for i in range(generated_ids.size(-1)):
        output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
        yield self._decode_generation(output_group_ids)

get_llama_tokenizer_types()

Get all the Llama tokenizer types/classes that need work-arounds.

When they can't be imported, a dummy class is created.

Source code in outlines/models/transformers.py
def get_llama_tokenizer_types():
    """Get all the Llama tokenizer types/classes that need work-arounds.

    When they can't be imported, a dummy class is created.

    """
    try:
        from transformers.models.llama import LlamaTokenizer
    except ImportError:

        class LlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.llama import LlamaTokenizerFast
    except ImportError:

        class LlamaTokenizerFast:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizer
    except ImportError:

        class CodeLlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizerFast
    except ImportError:

        class CodeLlamaTokenizerFast:  # type: ignore
            pass

    return (
        LlamaTokenizer,
        LlamaTokenizerFast,
        CodeLlamaTokenizer,
        CodeLlamaTokenizerFast,
    )

transformers(model_name, device=None, model_kwargs={}, tokenizer_kwargs={}, model_class=None, tokenizer_class=None)

Instantiate a model from the transformers library and its tokenizer.

Parameters:

Name Type Description Default
model_name str

The name of the model as listed on Hugging Face's model page.

required
device Optional[str]

The device(s) on which the model should be loaded. This overrides the device_map entry in model_kwargs when provided.

None
model_kwargs dict

A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the model.

{}
tokenizer_kwargs dict

A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the tokenizer.

{}

Returns:

Type Description
A `Transformers` model instance.
Source code in outlines/models/transformers.py
def transformers(
    model_name: str,
    device: Optional[str] = None,
    model_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_class=None,
    tokenizer_class=None,
):
    """Instantiate a model from the `transformers` library and its tokenizer.

    Parameters
    ----------
    model_name
        The name of the model as listed on Hugging Face's model page.
    device
        The device(s) on which the model should be loaded. This overrides
        the `device_map` entry in `model_kwargs` when provided.
    model_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the model.
    tokenizer_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the tokenizer.

    Returns
    -------
    A `Transformers` model instance.

    """
    if model_class is None or tokenizer_class is None:
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library needs to be installed in order to use `transformers` models."
            )
    if model_class is None:
        model_class = AutoModelForCausalLM
    if tokenizer_class is None:
        tokenizer_class = AutoTokenizer

    if device is not None:
        model_kwargs["device_map"] = device

    model = model_class.from_pretrained(model_name, **model_kwargs)

    tokenizer_kwargs.setdefault("padding_side", "left")
    tokenizer = tokenizer_class.from_pretrained(model_name, **tokenizer_kwargs)

    return Transformers(model, tokenizer)
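
An end-to-end sketch on top of the loader, assuming transformers and outlines are installed; the checkpoint name is illustrative and the high-level outlines.generate API is used for both free-form and regex-constrained generation:

import outlines

model = outlines.models.transformers("gpt2")  # illustrative checkpoint

# Unconstrained text generation through the high-level generator API.
generator = outlines.generate.text(model)
print(generator("The capital of France is", max_tokens=10))

# Constrained generation: a logits processor masks tokens so the output
# matches the regular expression.
pin = outlines.generate.regex(model, r"[0-9]{4}")
print(pin("A four digit PIN: "))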

transformers_vision

TransformersVision

Bases: Transformers

Source code in outlines/models/transformers_vision.py
class TransformersVision(Transformers):
    def __init__(self, model, tokenizer, processor):
        super().__init__(model, tokenizer)
        self.processor = processor

    def generate(  # type: ignore
        self,
        prompts: Union[str, List[str]],
        media: Union[List[Any], List[List[Any]]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using `transformers`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        media
            A List[PIL.Image] or List[List[PIL.Image]]
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text
        """
        inputs = self.processor(
            text=prompts, images=media, padding=True, return_tensors="pt"
        ).to(self.model.device)

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            # Should always be true until NotImplementedError above is fixed
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def stream(  # type: ignore
        self,
        prompts: Union[str, List[str]],
        media: Union[Any, List[Any]],  # TODO: docstring
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Iterator[Union[str, List[str]]]:
        raise NotImplementedError

generate(prompts, media, generation_parameters, logits_processor, sampling_parameters)

Generate text using transformers.

Parameters:

Name Type Description Default
prompts Union[str, List[str]]

A prompt or list of prompts.

required
media Union[List[Any], List[List[Any]]]

A List[PIL.Image] or List[List[PIL.Image]]

required
generation_parameters GenerationParameters

An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's __call__ method.

required
logits_processor Optional[OutlinesLogitsProcessor]

The logits processor to use when generating text.

required
sampling_parameters SamplingParameters

An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

required

Returns:

Type Description
The generated text
Source code in outlines/models/transformers_vision.py
def generate(  # type: ignore
    self,
    prompts: Union[str, List[str]],
    media: Union[List[Any], List[List[Any]]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using `transformers`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    media
        A List[PIL.Image] or List[List[PIL.Image]]
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text
    """
    inputs = self.processor(
        text=prompts, images=media, padding=True, return_tensors="pt"
    ).to(self.model.device)

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        # Should always be true until NotImplementedError above is fixed
        generated_ids = generated_ids.squeeze(0)

    return self._decode_generation(generated_ids)

transformers_vision(model_name, model_class, device=None, model_kwargs={}, processor_kwargs={}, tokenizer_class=None, processor_class=None)

Instantiate a model from the transformers library and its tokenizer.

Parameters:

Name Type Description Default
model_name str

The name of the model as listed on Hugging Face's model page.

required
model_class

The PreTrainedModel class from transformers to use in initializing the vision model from model_name. https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel

required
device Optional[str]

The device(s) on which the model should be loaded. This overrides the device_map entry in model_kwargs when provided.

None
model_kwargs dict

A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the model.

{}
processor_kwargs dict

A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the processor.

{}

Returns:

Type Description
A `TransformersVision` model instance.
Source code in outlines/models/transformers_vision.py
def transformers_vision(
    model_name: str,
    model_class,
    device: Optional[str] = None,
    model_kwargs: dict = {},
    processor_kwargs: dict = {},
    tokenizer_class=None,
    processor_class=None,
):
    """Instantiate a model from the `transformers` library and its tokenizer.

    Parameters
    ----------
    model_name
        The name of the model as listed on Hugging Face's model page.
    model_class
        The `PreTrainedModel` class from transformers to use in initializing the vision model from `model_name`.
        https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel
    device
        The device(s) on which the model should be loaded. This overrides
        the `device_map` entry in `model_kwargs` when provided.
    model_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the model.
    processor_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the processor.

    Returns
    -------
    A `TransformersVision` model instance.

    """
    if processor_class is None or tokenizer_class is None:
        try:
            from transformers import AutoProcessor, AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library needs to be installed in order to use `transformers` models."
            )
    if processor_class is None:
        processor_class = AutoProcessor
    if tokenizer_class is None:
        tokenizer_class = AutoTokenizer

    if device is not None:
        model_kwargs["device_map"] = device

    model = model_class.from_pretrained(model_name, **model_kwargs)

    processor_kwargs.setdefault("padding_side", "left")
    processor_kwargs.setdefault("pad_token", "[PAD]")
    processor = processor_class.from_pretrained(model_name, **processor_kwargs)

    if tokenizer_class is None:
        if getattr(processor, "tokenizer", None):
            tokenizer = processor.tokenizer
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name, **processor_kwargs)
    else:
        tokenizer = tokenizer_class.from_pretrained(model_name, **processor_kwargs)

    return TransformersVision(model, tokenizer, processor)
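
A hedged sketch of loading a vision-language model, assuming a CUDA machine with transformers and outlines installed; the checkpoint name, prompt template and image path below are all illustrative:

from PIL import Image
from transformers import LlavaForConditionalGeneration

import outlines

model = outlines.models.transformers_vision(
    "llava-hf/llava-1.5-7b-hf",  # illustrative checkpoint
    model_class=LlavaForConditionalGeneration,
    device="cuda",
)

generator = outlines.generate.text(model)
image = Image.open("photo.jpg")  # hypothetical local file
prompt = "USER: <image>\nDescribe the picture. ASSISTANT:"
print(generator(prompt, [image], max_tokens=64))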

vllm

VLLM

Represents a vLLM model.

We wrap models from model-providing libraries in order to give all of them the same interface in Outlines and allow users to easily switch between providers. This class wraps the vllm.LLM class from the vllm library.

Source code in outlines/models/vllm.py
class VLLM:
    """Represents a vLLM model.

    We wrap models from model-providing libraries in order to give all of
    them the same interface in Outlines and allow users to easily switch
    between providers. This class wraps the `vllm.LLM` class from the
    `vllm` library.

    """

    def __init__(self, model: "LLM"):
        self.model = model
        self.lora_request = None

        self.tokenizer = self._get_tokenizer()

    def _get_tokenizer(self):
        if hasattr(self.model, "get_tokenizer"):
            tokenizer = self.model.get_tokenizer()
        elif hasattr(self.model, "tokenizer"):
            if hasattr(self.model.tokenizer, "tokenizer"):
                tokenizer = self.model.tokenizer.tokenizer
            else:
                tokenizer = self.model.tokenizer
        else:
            raise ValueError(
                "The provided LLM instance neither has a "
                "`tokenizer` attribute nor a `get_tokenizer` method."
            )
        return adapt_tokenizer(tokenizer=tokenizer)

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor,
        sampling_parameters: SamplingParameters,
        *,
        sampling_params: Optional["SamplingParams"] = None,
        use_tqdm: bool = True,
    ):
        """Generate text using vLLM.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.
        sampling_params
            An instance of `vllm.sampling_params.SamplingParams`. The values
            passed via this dataclass supersede the values of the parameters
            in `generation_parameters` and `sampling_parameters`. See the
            vLLM documentation for more details: https://docs.vllm.ai/en/latest/dev/sampling_params.html.
        use_tqdm
            Whether to display a progress bar during inference.

        Returns
        -------
        The generated text, of shape `(n_batch, n_samples)`. If there is only
        one batch and several samples, the list is of shape `(n_samples)`. If
        this is a batch with several sequences but only one sample the list is
        of shape `(n_batch)`. If there is only one sequence and one sample, a
        string is returned.

        """
        from vllm.sampling_params import SamplingParams

        if sampling_params is None:
            sampling_params = SamplingParams()

        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

        # We only update the values in `sampling_params` if they
        # are specified by the user when calling the generator.
        if max_tokens is not None:
            sampling_params.max_tokens = max_tokens
        if stop_at is not None:
            if isinstance(stop_at, str):
                stop_at = [stop_at]
            sampling_params.stop = stop_at
        if seed is not None:
            sampling_params.seed = seed

        sampling_params.logits_processors = (
            [logits_processor] if logits_processor is not None else []
        )

        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )

        # We only update the values in `sampling_params` that
        # were not specified by the user.
        if sampling_params.n == 1:
            sampling_params.n = num_samples
            sampling_params.best_of = num_samples
        if top_p is not None and sampling_params.top_p == 1.0:
            sampling_params.top_p = top_p
        if top_k is not None and sampling_params.top_k == -1:
            sampling_params.top_k = top_k
            # TODO: remove this if statement once fixed
            # https://github.com/vllm-project/vllm/issues/5404#issuecomment-2175972897
            if top_k == 1:
                sampling_params.repetition_penalty = 0
        if temperature is not None and sampling_params.temperature == 1.0:
            sampling_params.temperature = temperature
        if sampler == "beam_search":
            sampling_params.use_beam_search = True

        results = self.model.generate(
            prompts,
            sampling_params=sampling_params,
            lora_request=self.lora_request,
            use_tqdm=use_tqdm,
        )
        results = [[sample.text for sample in batch.outputs] for batch in results]

        batch_size = len(results)
        sample_size = len(results[0])

        if batch_size == 1 and sample_size == 1:
            return results[0][0]
        elif batch_size == 1:
            return results[0]
        elif sample_size == 1:
            return [batch[0] for batch in results]

        return results

    def stream(self, *args, **kwargs):
        """Return a text generator.

        Streaming is not yet available for `vllm.LLM`.

        TODO: Implement the streaming functionality ourselves.

        """
        raise NotImplementedError(
            "Streaming is not available for the vLLM integration."
        )

    def load_lora(self, adapter_path: Optional[str]):
        from vllm.lora.request import LoRARequest

        if adapter_path is None:
            self.lora_request = None
        else:
            self.lora_request = LoRARequest(adapter_path, 1, adapter_path)
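
Below is a minimal, hedged sketch of attaching a LoRA adapter through `load_lora` before generating text. The model name and adapter path are placeholders, and it assumes that keyword arguments passed to `outlines.models.vllm` are forwarded to `vllm.LLM` (as the loader at the end of this page does).

```python
import outlines

# Placeholders: the model name and adapter path are illustrative only.
model = outlines.models.vllm(
    "mistralai/Mistral-7B-v0.1",
    enable_lora=True,  # forwarded to `vllm.LLM`; LoRA support must be enabled at load time
)

# `load_lora` builds a `LoRARequest` and stores it on the model; `generate`
# then passes it to `vllm.LLM.generate` via `lora_request`.
model.load_lora("path/to/lora-adapter")

generator = outlines.generate.text(model)
answer = generator("Explain LoRA adapters in one sentence.")
```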

generate(prompts, generation_parameters, logits_processor, sampling_parameters, *, sampling_params=None, use_tqdm=True)

Generate text using vLLM.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `prompts` | `Union[str, List[str]]` | A prompt or list of prompts. | required |
| `generation_parameters` | `GenerationParameters` | An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method. | required |
| `logits_processor` | | The logits processor to use when generating text. | required |
| `sampling_parameters` | `SamplingParameters` | An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines. | required |
| `sampling_params` | `Optional[SamplingParams]` | An instance of `vllm.sampling_params.SamplingParams`. The values passed via this dataclass supersede the values of the parameters in `generation_parameters` and `sampling_parameters`. See the vLLM documentation for more details: https://docs.vllm.ai/en/latest/dev/sampling_params.html | `None` |
| `use_tqdm` | `bool` | Whether to display a progress bar during inference. | `True` |

Returns:

The generated text, of shape `(n_batch, n_samples)`. If there is only one batch and several samples, the list is of shape `(n_samples)`. If this is a batch with several sequences but only one sample, the list is of shape `(n_batch)`. If there is only one sequence and one sample, a string is returned.
Source code in outlines/models/vllm.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor,
    sampling_parameters: SamplingParameters,
    *,
    sampling_params: Optional["SamplingParams"] = None,
    use_tqdm: bool = True,
):
    """Generate text using vLLM.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.
    sampling_params
        An instance of `vllm.sampling_params.SamplingParams`. The values
        passed via this dataclass supersede the values of the parameters
        in `generation_parameters` and `sampling_parameters`. See the
        vLLM documentation for more details: https://docs.vllm.ai/en/latest/dev/sampling_params.html.
    use_tqdm
        Whether to display a progress bar during inference.

    Returns
    -------
    The generated text, of shape `(n_batch, n_samples)`. If there is only
    one batch and several samples, the list is of shape `(n_samples)`. If
    this is a batch with several sequences but only one sample, the list
    is of shape `(n_batch)`. If there is only one sequence and one sample,
    a string is returned.

    """
    from vllm.sampling_params import SamplingParams

    if sampling_params is None:
        sampling_params = SamplingParams()

    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

    # We only update the values in `sampling_params` if they
    # are specified by the user when calling the generator.
    if max_tokens is not None:
        sampling_params.max_tokens = max_tokens
    if stop_at is not None:
        if isinstance(stop_at, str):
            stop_at = [stop_at]
        sampling_params.stop = stop_at
    if seed is not None:
        sampling_params.seed = seed

    sampling_params.logits_processors = (
        [logits_processor] if logits_processor is not None else []
    )

    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )

    # We only update the values in `sampling_params` that
    # were not specified by the user.
    if sampling_params.n == 1:
        sampling_params.n = num_samples
        sampling_params.best_of = num_samples
    if top_p is not None and sampling_params.top_p == 1.0:
        sampling_params.top_p = top_p
    if top_k is not None and sampling_params.top_k == -1:
        sampling_params.top_k = top_k
        # TODO: remove this if statement once fixed
        # https://github.com/vllm-project/vllm/issues/5404#issuecomment-2175972897
        if top_k == 1:
            sampling_params.repetition_penalty = 0
    if temperature is not None and sampling_params.temperature == 1.0:
        sampling_params.temperature = temperature
    if sampler == "beam_search":
        sampling_params.use_beam_search = True

    results = self.model.generate(
        prompts,
        sampling_params=sampling_params,
        lora_request=self.lora_request,
        use_tqdm=use_tqdm,
    )
    results = [[sample.text for sample in batch.outputs] for batch in results]

    batch_size = len(results)
    sample_size = len(results[0])

    if batch_size == 1 and sample_size == 1:
        return results[0][0]
    elif batch_size == 1:
        return results[0]
    elif sample_size == 1:
        return [batch[0] for batch in results]

    return results
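
The snippet below is a hedged usage sketch of the precedence rules above: values set explicitly on a `vllm.sampling_params.SamplingParams` instance supersede the ones Outlines would derive from its own sampler configuration. It assumes the generator returned by `outlines.generate.text` forwards extra keyword arguments to this `generate` method; the model name is a placeholder.

```python
import outlines
from vllm.sampling_params import SamplingParams

model = outlines.models.vllm("mistralai/Mistral-7B-v0.1")  # placeholder model name
generator = outlines.generate.text(model)

# Explicit values here take precedence over `generation_parameters` and
# `sampling_parameters`; unset fields keep vLLM's defaults and may still be
# filled in from the Outlines sampler settings.
params = SamplingParams(temperature=0.2, top_p=0.9, max_tokens=64)
answer = generator("Summarize vLLM in one sentence.", sampling_params=params)
```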

stream(*args, **kwargs)

Return a text generator.

Streaming is not yet available for vllm.LLM.

TODO: Implement the streaming functionality ourselves.

Source code in outlines/models/vllm.py
def stream(self, *args, **kwargs):
    """Return a text generator.

    Streaming is not yet available for `vllm.LLM`.

    TODO: Implement the streaming functionality ourselves.

    """
    raise NotImplementedError(
        "Streaming is not available for the vLLM integration."
    )

adapt_tokenizer(tokenizer)

Adapt a tokenizer to use to compile the FSM.

The API of Outlines tokenizers is slightly different from that of transformers. In addition, we need to handle the missing spaces in Llama's tokenizer to be able to compile FSMs for this model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `tokenizer` | `PreTrainedTokenizerBase` | The tokenizer of the model. | required |

Returns:

| Type | Description |
|------|-------------|
| `PreTrainedTokenizerBase` | The adapted tokenizer. |

Source code in outlines/models/vllm.py
def adapt_tokenizer(tokenizer: "PreTrainedTokenizerBase") -> "PreTrainedTokenizerBase":
    """Adapt a tokenizer to use to compile the FSM.

    The API of Outlines tokenizers is slightly different from that of `transformers`.
    In addition, we need to handle the missing spaces in Llama's tokenizer to be able
    to compile FSMs for this model.

    Parameters
    ----------
    tokenizer
        The tokenizer of the model.

    Returns
    -------
    PreTrainedTokenizerBase
        The adapted tokenizer.
    """
    from transformers import SPIECE_UNDERLINE

    tokenizer.vocabulary = tokenizer.get_vocab()
    tokenizer.special_tokens = set(tokenizer.all_special_tokens)

    def convert_token_to_string(token: Union[str, bytes]) -> str:
        string = tokenizer.convert_tokens_to_string([token])

        # A hack to handle missing spaces to HF's Llama tokenizers
        if (
            type(token) is str
            and token.startswith(SPIECE_UNDERLINE)
            or token == "<0x20>"
        ):
            return " " + string

        return string

    tokenizer.convert_token_to_string = convert_token_to_string

    return tokenizer
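
As a quick illustration, the sketch below adapts a Hugging Face tokenizer and inspects the attributes the FSM construction relies on. The model name is a placeholder, and the example assumes a SentencePiece-based tokenizer so that the `SPIECE_UNDERLINE` handling is exercised.

```python
from transformers import AutoTokenizer
from outlines.models.vllm import adapt_tokenizer

# Placeholder model name; any SentencePiece-based checkpoint behaves similarly.
hf_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer = adapt_tokenizer(hf_tokenizer)

# The adapted tokenizer exposes `vocabulary`, `special_tokens` and a patched
# `convert_token_to_string` that restores the leading space on "▁"-prefixed tokens.
print(len(tokenizer.vocabulary), len(tokenizer.special_tokens))
print(repr(tokenizer.convert_token_to_string("▁hello")))  # expected: ' hello'
```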

vllm(model_name, **vllm_model_params)

Load a vLLM model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model_name` | `str` | The name of the model to load from the HuggingFace hub. | required |
| `vllm_model_params` | | vLLM-specific model parameters. See the vLLM code for the full list: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py | `{}` |
Source code in outlines/models/vllm.py
def vllm(model_name: str, **vllm_model_params):
    """Load a vLLM model.

    Parameters
    ----------
    model_name
        The name of the model to load from the HuggingFace hub.
    vllm_model_params
        vLLM-specific model parameters. See the vLLM code for the full list:
        https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py

    """
    from vllm import LLM

    model = LLM(model_name, **vllm_model_params)

    return VLLM(model)
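
To close the loop, here is a hedged end-to-end sketch that loads a model with this function and runs regex-constrained generation; the model name, the extra vLLM keyword arguments and the pattern are illustrative placeholders.

```python
import outlines

model = outlines.models.vllm(
    "mistralai/Mistral-7B-v0.1",  # placeholder model name
    dtype="half",                 # forwarded to `vllm.LLM`
    max_model_len=4096,           # forwarded to `vllm.LLM`
)

# Structured generation: the regex is compiled into an FSM-based logits
# processor that `VLLM.generate` attaches to `SamplingParams.logits_processors`.
generator = outlines.generate.regex(model, r"[0-9]{4}-[0-9]{2}-[0-9]{2}")
date = generator("The Apollo 11 Moon landing took place on (YYYY-MM-DD): ")
```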