Models

`TransformerTokenizer`

Bases: Tokenizer

Represents a tokenizer for models in the transformers library.

Source code in outlines/models/transformers.py

class TransformerTokenizer(Tokenizer):
    """Represents a tokenizer for models in the `transformers` library."""

    def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
        """Wrap `tokenizer` and copy the attributes exposed by this class.

        Parameters
        ----------
        tokenizer
            The `transformers` tokenizer to wrap. Its `pad_token_id` is set
            to its `eos_token_id` when no padding token is defined.
        kwargs
            Accepted but not used by this constructor.

        """
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token

        if self.tokenizer.pad_token_id is None:
            # No dedicated padding token: pad with the end-of-sequence token.
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
            # Keep `pad_token` defined on this path as well, mirroring the
            # id fallback above, so reading it never raises AttributeError.
            self.pad_token = self.eos_token
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple["torch.LongTensor", "torch.LongTensor"]:
        """Tokenize `prompt` and return `(input_ids, attention_mask)`.

        `padding=True` and `return_tensors="pt"` are always forced; any other
        keyword argument is forwarded to the wrapped tokenizer.
        """
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: "torch.LongTensor") -> List[str]:
        """Batch-decode `token_ids` into strings, skipping special tokens."""
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        """Convert a single vocabulary token to the string it stands for."""
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        if self.is_llama:
            # A hack to handle missing spaces to HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

        return string

    def __eq__(self, other):
        """Compare with another tokenizer wrapper of the same type.

        When this instance carries both `model_name` and `kwargs` attributes
        the comparison uses them; otherwise the wrapped tokenizers are
        compared. Other types yield `NotImplemented`.
        """
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __hash__(self):
        """Hash the instance through `Hasher.hash` of the wrapped tokenizer."""
        return hash(Hasher.hash(self.tokenizer))

    def __getstate__(self):
        """Return the picklable state: only the wrapped tokenizer."""
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        """Rebuild every derived attribute by re-running `__init__`."""
        self.__init__(state["tokenizer"])

`Transformers`

Represents a transformers model.

Source code in outlines/models/transformers.py

class Transformers:
    """Represents a `transformers` model."""

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
    ):
        """Store `model` and wrap `tokenizer` in a `TransformerTokenizer`."""
        self.model = model
        self.tokenizer = TransformerTokenizer(tokenizer)

    def forward(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
        """Compute a forward pass through the transformer model.

        Parameters
        ----------
        input_ids
            The input token ids.  Must be one or two dimensional.
        attention_mask
            The attention mask.  Must be one or two dimensional.
        past_key_values
            A tuple of tuples containing the cached key and value tensors for each
            attention head.

        Returns
        -------
        The computed logits and the new cached key and value tensors.

        Raises
        ------
        ImportError
            If the `torch` library is not installed.

        """
        try:
            import torch
        except ImportError as err:
            # The exception used to be built but never raised, which let the
            # failure surface later as a NameError on `torch`.
            raise ImportError(
                "The `torch` library needs to be installed to use `transformers` models."
            ) from err
        assert 0 < input_ids.ndim < 3

        if past_key_values:
            # With a cache only the last token id is fed to the model.
            input_ids = input_ids[..., -1].unsqueeze(-1)

        with torch.inference_mode():
            output = self.model(
                input_ids,
                attention_mask=attention_mask,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
                past_key_values=past_key_values,
            )

        return output.logits, output.past_key_values

    def __call__(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
        """Run `forward` and return the last position's logits with the cache."""
        logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values)
        next_token_logits = logits[..., -1, :]

        return next_token_logits, kv_cache

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using `transformers`.

        Arguments
        ---------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text
        """
        generated_ids = self._generate_ids(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        return self._decode_generation(generated_ids)

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Iterator[Union[str, List[str]]]:
        """
        Temporary stream stand-in which implements the stream() signature
        and equivalent behaviour, but nothing is yielded until generation
        completes.

        TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
        """
        generated_ids = self._generate_ids(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )

        # Walk the last dimension and decode one position per yielded item.
        for i in range(generated_ids.size(-1)):
            output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
            yield self._decode_generation(output_group_ids)

    def _prepare_inputs(self, prompts: Union[str, List[str]]) -> dict:
        """Tokenize `prompts` and build the keyword inputs for the model."""
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)

        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        # Drop the mask when the model's `forward` has no such parameter.
        if "attention_mask" not in inspect.signature(self.model.forward).parameters:
            del inputs["attention_mask"]

        return inputs

    def _generate_ids(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ):
        """Run generation and return the generated ids shared by `generate` and `stream`."""
        inputs = self._prepare_inputs(prompts)

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        return generated_ids

    def _get_generation_kwargs(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> dict:
        """
        Convert outlines generation parameters into model.generate kwargs
        """
        from transformers import GenerationConfig, LogitsProcessorList, set_seed

        max_new_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )
        if max_new_tokens is None:
            # No limit requested: fall back to a very large token budget.
            max_new_tokens = int(2**30)

        # global seed, not desirable
        if seed is not None:
            set_seed(seed)

        if logits_processor is not None:
            logits_processor_list = LogitsProcessorList([logits_processor])
        else:
            logits_processor_list = None

        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            stop_strings=stop_at,
            num_return_sequences=(num_samples or 1),
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            do_sample=(sampler == "multinomial"),
            num_beams=(num_samples if sampler == "beam_search" else 1),
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        return dict(
            logits_processor=logits_processor_list,
            generation_config=generation_config,
            tokenizer=self.tokenizer.tokenizer,
        )

    def _generate_output_seq(
        self, prompts, inputs, generation_config, **generation_kwargs
    ):
        """Call `model.generate` and return only the newly generated ids."""
        input_ids = inputs["input_ids"]
        output_ids = self.model.generate(
            **inputs, generation_config=generation_config, **generation_kwargs
        )

        # encoder-decoder returns output_ids only, decoder-only returns full seq ids
        if self.model.config.is_encoder_decoder:
            generated_ids = output_ids
        else:
            generated_ids = output_ids[:, input_ids.shape[1] :]

        # if batch list inputs AND multiple samples per input, convert generated_id to 3D view
        num_samples = generation_config.num_return_sequences or 1

        if num_samples > 1 and isinstance(prompts, list):
            batch_size = input_ids.size(0)
            generated_ids = generated_ids.view(batch_size, num_samples, -1)

        return generated_ids

    def _decode_generation(self, generated_ids: "torch.Tensor"):
        """Decode 1D, 2D or 3D generated ids into a str, list or list of lists."""
        num_dims = len(generated_ids.shape)
        if num_dims == 1:
            return self.tokenizer.decode([generated_ids])[0]
        if num_dims == 2:
            return self.tokenizer.decode(generated_ids)
        if num_dims == 3:
            return [self.tokenizer.decode(sample_ids) for sample_ids in generated_ids]
        raise TypeError(
            f"Generated outputs aren't 1D, 2D or 3D, but instead are {generated_ids.shape}"
        )

`forward(input_ids, attention_mask, past_key_values=None)`

Compute a forward pass through the transformer model.

Parameters

input_ids The input token ids. Must be one or two dimensional. attention_mask The attention mask. Must be one or two dimensional. past_key_values A tuple of tuples containing the cached key and value tensors for each attention head.

Returns

The computed logits and the new cached key and value tensors.

Source code in outlines/models/transformers.py

def forward(
    self,
    input_ids: "torch.LongTensor",
    attention_mask: "torch.LongTensor",
    past_key_values: Optional[Tuple] = None,
) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
    """Compute a forward pass through the transformer model.

    Parameters
    ----------
    input_ids
        The input token ids.  Must be one or two dimensional.
    attention_mask
        The attention mask.  Must be one or two dimensional.
    past_key_values
        A tuple of tuples containing the cached key and value tensors for each
        attention head.

    Returns
    -------
    The computed logits and the new cached key and value tensors.

    Raises
    ------
    ImportError
        If the `torch` library is not installed.

    """
    try:
        import torch
    except ImportError as err:
        # The exception used to be built but never raised, which let the
        # failure surface later as a NameError on `torch`.
        raise ImportError(
            "The `torch` library needs to be installed to use `transformers` models."
        ) from err
    assert 0 < input_ids.ndim < 3

    if past_key_values:
        # With a cache only the last token id is fed to the model.
        input_ids = input_ids[..., -1].unsqueeze(-1)

    with torch.inference_mode():
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
            past_key_values=past_key_values,
        )

    return output.logits, output.past_key_values

`generate(prompts, generation_parameters, logits_processor, sampling_parameters)`

Generate text using transformers.

Arguments

prompts A prompt or list of prompts. generation_parameters An instance of GenerationParameters that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to SequenceGeneratorAdapter's `__call__` method. logits_processor The logits processor to use when generating text. sampling_parameters An instance of SamplingParameters, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

Returns

The generated text

Source code in outlines/models/transformers.py

def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using `transformers`.

    Arguments
    ---------
    prompts
        A single prompt or a list of prompts.
    generation_parameters
        A `GenerationParameters` instance; it is unpacked downstream into
        the maximum number of new tokens, the stop sequences and the seed.
    logits_processor
        The logits processor applied during generation, or `None`.
    sampling_parameters
        A `SamplingParameters` instance naming the sampler and carrying its
        related settings.

    Returns
    -------
    The decoded generated text.
    """
    single_prompt = isinstance(prompts, str)

    # A lone prompt is wrapped in a list so it is encoded as a batch of one.
    batch = [prompts] if single_prompt else prompts
    input_ids, attention_mask = self.tokenizer.encode(batch)

    ids_on_device = input_ids.to(self.model.device)
    mask_on_device = attention_mask.to(self.model.device)

    # Only pass the mask along when the model's `forward` accepts it.
    forward_parameters = inspect.signature(self.model.forward).parameters
    inputs = {"input_ids": ids_on_device}
    if "attention_mask" in forward_parameters:
        inputs["attention_mask"] = mask_on_device

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # A single string prompt loses its leading batch dimension when it is 1.
    if single_prompt:
        generated_ids = generated_ids.squeeze(0)

    return self._decode_generation(generated_ids)

`stream(prompts, generation_parameters, logits_processor, sampling_parameters)`

Temporary stream stand-in which implements the stream() signature and equivalent behaviour, but nothing is yielded until generation completes.

TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

Source code in outlines/models/transformers.py

def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Iterator[Union[str, List[str]]]:
    """
    Placeholder implementation of the stream() signature: the full
    generation runs first, and only then are the results yielded one
    position at a time.

    TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
    """
    single_prompt = isinstance(prompts, str)

    # A lone prompt is wrapped in a list so it is encoded as a batch of one.
    batch = [prompts] if single_prompt else prompts
    input_ids, attention_mask = self.tokenizer.encode(batch)

    ids_on_device = input_ids.to(self.model.device)
    mask_on_device = attention_mask.to(self.model.device)

    # Only pass the mask along when the model's `forward` accepts it.
    forward_parameters = inspect.signature(self.model.forward).parameters
    inputs = {"input_ids": ids_on_device}
    if "attention_mask" in forward_parameters:
        inputs["attention_mask"] = mask_on_device

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # A single string prompt loses its leading batch dimension when it is 1.
    if single_prompt:
        generated_ids = generated_ids.squeeze(0)

    # Slice the last dimension index by index and decode each slice.
    num_positions = generated_ids.size(-1)
    for position in range(num_positions):
        position_ids = generated_ids.select(-1, position)
        yield self._decode_generation(position_ids.unsqueeze(-1))

`get_llama_tokenizer_types()`

Get all the Llama tokenizer types/classes that need work-arounds.

When they can't be imported, a dummy class is created.

Source code in outlines/models/transformers.py

def get_llama_tokenizer_types():
    """Return the Llama tokenizer classes that need work-arounds.

    Each class is imported from `transformers`. A class whose import fails
    is replaced by an empty stand-in class of the same name, so the result
    always holds four classes and can be handed to `isinstance`.

    """
    llama_types = []

    try:
        from transformers.models.llama import LlamaTokenizer
    except ImportError:

        class LlamaTokenizer:  # type: ignore
            pass

    llama_types.append(LlamaTokenizer)

    try:
        from transformers.models.llama import LlamaTokenizerFast
    except ImportError:

        class LlamaTokenizerFast:  # type: ignore
            pass

    llama_types.append(LlamaTokenizerFast)

    try:
        from transformers.models.code_llama import CodeLlamaTokenizer
    except ImportError:

        class CodeLlamaTokenizer:  # type: ignore
            pass

    llama_types.append(CodeLlamaTokenizer)

    try:
        from transformers.models.code_llama import CodeLlamaTokenizerFast
    except ImportError:

        class CodeLlamaTokenizerFast:  # type: ignore
            pass

    llama_types.append(CodeLlamaTokenizerFast)

    return tuple(llama_types)

`transformers(model_name, device=None, model_kwargs={}, tokenizer_kwargs={}, model_class=None, tokenizer_class=None)`

Instantiate a model from the transformers library and its tokenizer.

Parameters

model_name The name of the model as listed on Hugging Face's model page. device The device(s) on which the model should be loaded. This overrides the device_map entry in model_kwargs when provided. model_kwargs A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the model. tokenizer_kwargs A dictionary that contains the keyword arguments to pass to the from_pretrained method when loading the tokenizer.

Returns

A Transformers model instance.

Source code in outlines/models/transformers.py

def transformers(
    model_name: str,
    device: Optional[str] = None,
    model_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_class=None,
    tokenizer_class=None,
):
    """Instantiate a model from the `transformers` library and its tokenizer.

    Parameters
    ----------
    model_name
        The name of the model as listed on Hugging Face's model page.
    device
        The device(s) on which the model should be loaded. This overrides
        the `device_map` entry in `model_kwargs` when provided.
    model_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the model. It is copied and
        never modified.
    tokenizer_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the tokenizer. It is copied
        and never modified; `padding_side` defaults to `"left"`.
    model_class
        The class whose `from_pretrained` loads the model. Defaults to
        `AutoModelForCausalLM`.
    tokenizer_class
        The class whose `from_pretrained` loads the tokenizer. Defaults to
        `AutoTokenizer`.

    Returns
    -------
    A `Transformers` model instance.

    Raises
    ------
    ImportError
        If a default class is needed and `transformers` is not installed.

    """
    if model_class is None or tokenizer_class is None:
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError as err:
            raise ImportError(
                "The `transformers` library needs to be installed in order to use `transformers` models."
            ) from err
        if model_class is None:
            model_class = AutoModelForCausalLM
        if tokenizer_class is None:
            tokenizer_class = AutoTokenizer

    # Work on copies: writing into the arguments would alter the caller's
    # dictionaries and, worse, the shared `{}` defaults, so that a
    # `device_map` from one call would leak into every later call.
    model_kwargs = dict(model_kwargs or {})
    tokenizer_kwargs = dict(tokenizer_kwargs or {})

    if device is not None:
        model_kwargs["device_map"] = device

    model = model_class.from_pretrained(model_name, **model_kwargs)

    tokenizer_kwargs.setdefault("padding_side", "left")
    tokenizer = tokenizer_class.from_pretrained(model_name, **tokenizer_kwargs)

    return Transformers(model, tokenizer)

Integration with OpenAI's API.

`OpenAI`

An object that represents the OpenAI API.

Source code in outlines/models/openai.py

class OpenAI:
    """An object that represents the OpenAI API."""

    def __init__(
        self,
        client,
        config,
        system_prompt: Optional[str] = None,
    ):
        """Create an `OpenAI` instance.

        This class supports the standard OpenAI API, the Azure OpenAI API as
        well as compatible APIs that rely on the OpenAI client.

        Parameters
        ----------
        client
            An instance of the API's async client.
        config
            An instance of `OpenAIConfig`. Can be useful to specify some
            parameters that cannot be set by calling this class' methods.
        system_prompt
            The default system prompt, used by `__call__` when no system
            prompt is passed for a given call.
        """

        self.client = client
        self.config = config
        # Previously accepted and silently dropped; it is now kept as the
        # per-instance default for `__call__`.
        self.system_prompt = system_prompt

        # We count the total number of prompt and generated tokens as returned
        # by the OpenAI API, summed over all the requests performed with this
        # model instance.
        self.prompt_tokens = 0
        self.completion_tokens = 0

        # Post-processing hook applied to every response; identity by default.
        self.format_sequence = lambda seq: seq

    def __call__(
        self,
        prompt: Union[str, List[str]],
        max_tokens: Optional[int] = None,
        stop_at: Optional[Union[List[str], str]] = None,
        *,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        samples: Optional[int] = None,
    ) -> np.ndarray:
        """Call the OpenAI API to generate text.

        Every argument left to `None` falls back to the value stored on the
        instance (its configuration, or its default system prompt).

        Parameters
        ----------
        prompt
            A string or list of strings that will be used to prompt the model
        max_tokens
            The maximum number of tokens to generate
        stop_at
            A string or list of strings such that the generation stops when
            they are generated. Up to 4 words where the API will stop the
            completion.
        system_prompt
            The content of the system message that precedes the user's prompt.
        temperature
            The value of the temperature used to sample tokens
        samples
            The number of completions to generate for each prompt

        Returns
        -------
        The response(s), after going through `format_sequence`.

        """
        if max_tokens is None:
            max_tokens = self.config.max_tokens
        if stop_at is None:
            stop_at = self.config.stop
        if temperature is None:
            temperature = self.config.temperature
        if samples is None:
            samples = self.config.n
        if system_prompt is None:
            system_prompt = self.system_prompt

        config = replace(self.config, max_tokens=max_tokens, temperature=temperature, n=samples, stop=stop_at)  # type: ignore

        response, prompt_tokens, completion_tokens = generate_chat(
            prompt, system_prompt, self.client, config
        )
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens

        return self.format_sequence(response)

    def stream(self, *args, **kwargs):
        """Always raise: streaming is not supported for this model."""
        raise NotImplementedError(
            "Streaming is currently not supported for the OpenAI API"
        )

    def new_with_replacements(self, **kwargs):
        """Return a shallow copy whose config has `kwargs` replaced."""
        new_instance = copy.copy(self)
        new_instance.config = replace(new_instance.config, **kwargs)
        return new_instance

    def __str__(self):
        return self.__class__.__name__ + " API"

    def __repr__(self):
        return str(self.config)

`__call__(prompt, max_tokens=None, stop_at=None, *, system_prompt=None, temperature=None, samples=None)`

Call the OpenAI API to generate text.

Parameters

prompt A string or list of strings that will be used to prompt the model. max_tokens The maximum number of tokens to generate. stop_at A string or list of strings such that the generation stops when they are generated. Up to 4 words where the API will stop the completion. system_prompt The content of the system message that precedes the user's prompt. temperature The value of the temperature used to sample tokens. samples The number of completions to generate for each prompt.

Source code in outlines/models/openai.py

def __call__(
    self,
    prompt: Union[str, List[str]],
    max_tokens: Optional[int] = None,
    stop_at: Optional[Union[List[str], str]] = None,
    *,
    system_prompt: Optional[str] = None,
    temperature: Optional[float] = None,
    samples: Optional[int] = None,
) -> np.ndarray:
    """Call the OpenAI API to generate text.

    `max_tokens`, `stop_at`, `temperature` and `samples` fall back to the
    instance configuration when left to `None`.

    Parameters
    ----------
    prompt
        A string or list of strings that will be used to prompt the model
    max_tokens
        The maximum number of tokens to generate
    stop_at
        A string or list of strings such that the generation stops when
        they are generated. Up to 4 words where the API will stop the
        completion.
    system_prompt
        The content of the system message that precedes the user's prompt.
    temperature
        The value of the temperature used to sample tokens
    samples
        The number of completions to generate for each prompt

    Returns
    -------
    The response(s), after going through `format_sequence`.

    """
    base_config = self.config

    # Resolve each per-call override against the stored configuration.
    max_tokens = base_config.max_tokens if max_tokens is None else max_tokens
    stop_at = base_config.stop if stop_at is None else stop_at
    temperature = base_config.temperature if temperature is None else temperature
    samples = base_config.n if samples is None else samples

    request_config = replace(self.config, max_tokens=max_tokens, temperature=temperature, n=samples, stop=stop_at)  # type: ignore

    outcome = generate_chat(prompt, system_prompt, self.client, request_config)
    response, prompt_tokens, completion_tokens = outcome

    # Keep the running usage totals for this instance up to date.
    self.prompt_tokens += prompt_tokens
    self.completion_tokens += completion_tokens

    return self.format_sequence(response)

`__init__(client, config, system_prompt=None)`

Create an OpenAI instance.

This class supports the standard OpenAI API, the Azure OpenAI API as well as compatible APIs that rely on the OpenAI client.

Parameters

client An instance of the API's async client. config An instance of OpenAIConfig. Can be useful to specify some parameters that cannot be set by calling this class' methods.

Source code in outlines/models/openai.py

def __init__(
    self,
    client,
    config,
    system_prompt: Optional[str] = None,
):
    """Create an `OpenAI` instance.

    This class supports the standard OpenAI API, the Azure OpenAI API as
    well as compatible APIs that rely on the OpenAI client.

    Parameters
    ----------
    client
        An instance of the API's async client.
    config
        An instance of `OpenAIConfig`. Can be useful to specify some
        parameters that cannot be set by calling this class' methods.
    system_prompt
        A default system prompt, stored on the instance as `system_prompt`.
    """

    self.client = client
    self.config = config
    # Previously accepted and silently dropped; keep it on the instance.
    self.system_prompt = system_prompt

    # We count the total number of prompt and generated tokens as returned
    # by the OpenAI API, summed over all the requests performed with this
    # model instance.
    self.prompt_tokens = 0
    self.completion_tokens = 0

    # Post-processing hook applied to responses; identity by default.
    self.format_sequence = lambda seq: seq

`OpenAIConfig` `dataclass`

Represents the parameters of the OpenAI API.

The information was last fetched on 2023/11/20. We document below the properties that are specific to the OpenAI API. Not all these properties are supported by Outlines.

Properties

model The name of the model. Available models can be found on OpenAI's website. frequency_penalty Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text. logit_bias Modifies the likelihood of specified tokens to appear in the completion. Number between -100 (forbid) and +100 (only allows). n The number of completions to return for each prompt. presence_penalty Similar to frequency penalty. response_format Specifies the format the model must output. {"type": "json_object"} enables JSON mode. seed Two completions with the same seed value should return the same completion. This is however not guaranteed. stop Up to 4 words where the API will stop the completion. temperature Number between 0 and 2. Higher values make the output more random, while lower values make it more deterministic. top_p Number between 0 and 1. Parameter for nucleus sampling. user A unique identifier for the end-user.

Source code in outlines/models/openai.py

@dataclass(frozen=True)
class OpenAIConfig:
    """Represents the parameters of the OpenAI API.

    The information was last fetched on 2023/11/20. We document below the
    properties that are specific to the OpenAI API. Not all these properties are
    supported by Outlines.

    Properties
    ----------
    model
        The name of the model. Available models can be found on OpenAI's website.
    frequency_penalty
        Number between -2.0 and 2.0. Positive values penalize new tokens based on
        their existing frequency in the text.
    logit_bias
        Modifies the likelihood of specified tokens to appear in the completion.
        Number between -100 (forbid) and +100 (only allows).
    max_tokens
        The maximum number of tokens to generate. `None` by default.
    n
        The number of completions to return for each prompt.
    presence_penalty
        Similar to frequency penalty.
    response_format
        Specifies the format the model must output. `{"type": "json_object"}`
        enables JSON mode.
    seed
        Two completions with the same `seed` value should return the same
        completion. This is however not guaranteed.
    stop
        Up to 4 words where the API will stop the completion.
    temperature
        Number between 0 and 2. Higher values make the output more random, while
        lower values make it more deterministic.
    top_p
        Number between 0 and 1. Parameter for nucleus sampling.
    user
        A unique identifier for the end-user.

    """

    model: str = ""
    frequency_penalty: float = 0
    logit_bias: Dict[int, int] = field(default_factory=dict)
    max_tokens: Optional[int] = None
    n: int = 1
    presence_penalty: float = 0
    response_format: Optional[Dict[str, str]] = None
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    temperature: float = 1.0
    # A fraction between 0 and 1, hence `float`; the default value itself is
    # left untouched so the dataclass repr does not change.
    top_p: float = 1
    user: str = field(default_factory=str)

`error_handler(api_call_fn)`

Handle OpenAI API errors and missing API key.

Source code in outlines/models/openai.py

def error_handler(api_call_fn: Callable) -> Callable:
    """Handle OpenAI API errors and missing API key.

    Wraps `api_call_fn` so that transient OpenAI failures (timeouts, internal
    server errors, rate limiting) are re-raised as `OSError`. All other
    exceptions, including authentication, bad-request, conflict,
    permission-denied, not-found and unprocessable-entity errors, propagate
    unchanged.

    Parameters
    ----------
    api_call_fn
        The function that performs the API call. Both regular functions and
        coroutine functions (as reported by `inspect.iscoroutinefunction`) are
        supported.

    Returns
    -------
    A wrapper with the same call signature as `api_call_fn`. The wrapper is a
    coroutine function when `api_call_fn` is one, so the error translation also
    covers exceptions raised while the call is being awaited.

    Raises
    ------
    OSError
        When the wrapped call fails with `openai.APITimeoutError`,
        `openai.InternalServerError` or `openai.RateLimitError`.

    """
    import functools
    import inspect

    def connection_errors():
        # `openai` is imported lazily, at call time, so that decorating a
        # function does not require the package to be installed.
        import openai

        return (
            openai.APITimeoutError,
            openai.InternalServerError,
            openai.RateLimitError,
        )

    if inspect.iscoroutinefunction(api_call_fn):
        # A coroutine function only raises once awaited: the await has to
        # happen inside the `try`, otherwise no API error is ever intercepted.
        @functools.wraps(api_call_fn)
        async def async_call(*args, **kwargs):
            transient = connection_errors()
            try:
                return await api_call_fn(*args, **kwargs)
            except transient as e:
                raise OSError(f"Could not connect to the OpenAI API: {e}") from e

        return async_call

    @functools.wraps(api_call_fn)
    def call(*args, **kwargs):
        transient = connection_errors()
        try:
            return api_call_fn(*args, **kwargs)
        except transient as e:
            raise OSError(f"Could not connect to the OpenAI API: {e}") from e

    return call

`generate_chat(prompt, system_prompt, client, config)` `async`

Call OpenAI's Chat Completion API.

Parameters

- `prompt`: The prompt we use to start the generation. Passed to the model with the "user" role.
- `system_prompt`: The system prompt, passed to the model with the "system" role before the prompt.
- `client`: The API client.
- `config`: An `OpenAIConfig` instance.

Returns

A tuple that contains the model's response(s) and usage statistics.

Source code in outlines/models/openai.py

@functools.partial(vectorize, signature="(),(),(),()->(s),(),()")
async def generate_chat(
    prompt: str,
    system_prompt: Union[str, None],
    client,
    config: OpenAIConfig,
) -> Tuple[np.ndarray, int, int]:
    """Call OpenAI's Chat Completion API.

    Parameters
    ----------
    prompt
        The prompt we use to start the generation. Sent as the message with
        the "user" role.
    system_prompt
        The system prompt. When truthy, it is sent with the "system" role
        ahead of the user message; otherwise no system message is sent.
    client
        The API client; its `chat.completions.create` coroutine is awaited.
    config
        An `OpenAIConfig` instance. All of its fields are forwarded to the
        API call as keyword arguments.

    Returns
    -------
    A tuple holding an array with the content of the first `config.n`
    choices, the number of prompt tokens and the number of completion tokens
    reported in the response's usage statistics.

    """
    # The conversation is assembled up front; `call_api` reads it by closure.
    messages = [{"role": "user", "content": prompt}]
    if system_prompt:
        messages.insert(0, {"role": "system", "content": system_prompt})

    @error_handler
    @cache()
    async def call_api(prompt, system_prompt, config):
        # The arguments are not used directly in the body: the request is
        # built from the enclosing scope's `messages` and `client`.
        completion = await client.chat.completions.create(
            messages=messages,
            **asdict(config),  # type: ignore
        )
        return completion.model_dump()

    payload = await call_api(prompt, system_prompt, config)

    choices = payload["choices"]
    texts = np.array([choices[idx]["message"]["content"] for idx in range(config.n)])
    token_usage = payload["usage"]

    return texts, token_usage["prompt_tokens"], token_usage["completion_tokens"]

Models

TransformerTokenizer

Transformers

forward(input_ids, attention_mask, past_key_values=None)

Parameters

Returns

generate(prompts, generation_parameters, logits_processor, sampling_parameters)

Arguments

Returns

stream(prompts, generation_parameters, logits_processor, sampling_parameters)

get_llama_tokenizer_types()

transformers(model_name, device=None, model_kwargs={}, tokenizer_kwargs={}, model_class=None, tokenizer_class=None)

Parameters

Returns

OpenAI

__call__(prompt, max_tokens=None, stop_at=None, *, system_prompt=None, temperature=None, samples=None)

Parameters

__init__(client, config, system_prompt=None)

Parameters

OpenAIConfig dataclass

Properties

error_handler(api_call_fn)

generate_chat(prompt, system_prompt, client, config) async

Parameters

Returns

`TransformerTokenizer`

`Transformers`

`forward(input_ids, attention_mask, past_key_values=None)`

`generate(prompts, generation_parameters, logits_processor, sampling_parameters)`

`stream(prompts, generation_parameters, logits_processor, sampling_parameters)`

`get_llama_tokenizer_types()`

`transformers(model_name, device=None, model_kwargs={}, tokenizer_kwargs={}, model_class=None, tokenizer_class=None)`

`OpenAI`

`__call__(prompt, max_tokens=None, stop_at=None, *, system_prompt=None, temperature=None, samples=None)`

`__init__(client, config, system_prompt=None)`

`OpenAIConfig` `dataclass`

`error_handler(api_call_fn)`

`generate_chat(prompt, system_prompt, client, config)` `async`