llamacpp

Integration with the llama-cpp-python library.
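
For example, a minimal sketch of wrapping a llama_cpp.Llama instance (the GGUF path and settings below are placeholders, not part of the library):

from llama_cpp import Llama

from outlines.models.llamacpp import from_llamacpp

llm = Llama("model.gguf", n_ctx=2048)  # hypothetical local GGUF file
model = from_llamacpp(llm)

# chat_mode defaults to True, so a plain `str` is sent as a user message.
print(model.generate("What is the capital of France?", max_tokens=32))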

LlamaCpp

Bases: Model

Thin wrapper around the llama_cpp.Llama model.

This wrapper converts the input and output types specified by the user at a higher level into arguments for the llama_cpp.Llama model.

Source code in outlines/models/llamacpp.py
class LlamaCpp(Model):
    """Thin wrapper around the `llama_cpp.Llama` model.

    This wrapper converts the input and output types specified by the user at
    a higher level into arguments for the `llama_cpp.Llama` model.
    """

    tensor_library_name = "numpy"

    def __init__(self, model: "Llama", chat_mode: bool = True):
        """
        Parameters
        ----------
        model
            A `llama_cpp.Llama` model instance.
        chat_mode
            Whether to enable chat mode. If `False`, the model will regard
            all `str` inputs as plain text prompts. If `True`, the model will
            regard all `str` inputs as user messages in a chat conversation.

        """
        self.model = model
        self.tokenizer = LlamaCppTokenizer(self.model)

        # Note: llama-cpp-python provides a default chat-template fallback even when
        # the user hasn't explicitly configured one:
        # https://github.com/abetlen/llama-cpp-python/blob/c37132b/llama_cpp/llama.py#L540-L545
        # We keep the default as True because the upstream library generally favors chat-style usage.
        self.type_adapter = LlamaCppTypeAdapter(has_chat_template=chat_mode)

    def generate(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using `llama-cpp-python`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        **inference_kwargs
            Additional keyword arguments to pass to the `Llama.__call__`
            method of the `llama-cpp-python` library.

        Returns
        -------
        str
            The text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)

        if isinstance(prompt, str):
            completion = self.model(
                prompt,
                logits_processor=self.type_adapter.format_output_type(output_type),
                **inference_kwargs,
            )
            result = completion["choices"][0]["text"]
        elif isinstance(prompt, list):
            completion = self.model.create_chat_completion(
                prompt,
                logits_processor=self.type_adapter.format_output_type(output_type),
                **inference_kwargs,
            )
            result = completion["choices"][0]["message"]["content"]
        else:  # Never reached  # pragma: no cover
            raise ValueError("Unexpected prompt type.")

        self.model.reset()

        return result

    def generate_batch(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> str:
        raise NotImplementedError("LlamaCpp does not support batch generation.")

    def generate_stream(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using `llama-cpp-python`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        **inference_kwargs
            Additional keyword arguments to pass to the `Llama.__call__`
            method of the `llama-cpp-python` library.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)

        if isinstance(prompt, str):
            generator = self.model(
                prompt,
                logits_processor=self.type_adapter.format_output_type(output_type),
                stream=True,
                **inference_kwargs,
            )
            for chunk in generator:
                yield chunk["choices"][0]["text"]

        elif isinstance(prompt, list):
            generator = self.model.create_chat_completion(
                prompt,
                logits_processor=self.type_adapter.format_output_type(output_type),
                stream=True,
                **inference_kwargs,
            )
            for chunk in generator:
                yield chunk["choices"][0]["delta"].get("content", "")
        else:  # Never reached  # pragma: no cover
            raise ValueError("Unexpected prompt type.")

__init__(model, chat_mode=True)

Parameters:

model (Llama), required
    A llama_cpp.Llama model instance.

chat_mode (bool), default: True
    Whether to enable chat mode. If False, the model will regard all str inputs as plain text prompts. If True, the model will regard all str inputs as user messages in a chat conversation.
Source code in outlines/models/llamacpp.py
def __init__(self, model: "Llama", chat_mode: bool = True):
    """
    Parameters
    ----------
    model
        A `llama_cpp.Llama` model instance.
    chat_mode
        Whether to enable chat mode. If `False`, the model will regard
        all `str` inputs as plain text prompts. If `True`, the model will
        regard all `str` inputs as user messages in a chat conversation.

    """
    self.model = model
    self.tokenizer = LlamaCppTokenizer(self.model)

    # Note: llama-cpp-python provides a default chat-template fallback even when
    # the user hasn't explicitly configured one:
    # https://github.com/abetlen/llama-cpp-python/blob/c37132b/llama_cpp/llama.py#L540-L545
    # We keep the default as True because the upstream library generally favors chat-style usage.
    self.type_adapter = LlamaCppTypeAdapter(has_chat_template=chat_mode)

generate(model_input, output_type=None, **inference_kwargs)

Generate text using llama-cpp-python.

Parameters:

model_input (Union[Chat, str]), required
    The prompt based on which the model will generate a response.

output_type (Optional[OutlinesLogitsProcessor]), default: None
    The logits processor the model will use to constrain the format of the generated text.

**inference_kwargs (Any)
    Additional keyword arguments to pass to the Llama.__call__ method of the llama-cpp-python library.

Returns:

str
    The text generated by the model.
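
For example, a minimal sketch assuming a LlamaCpp instance named model:

# Extra keyword arguments such as `max_tokens` are forwarded to the
# underlying llama-cpp-python call.
result = model.generate("Write a haiku about the sea.", max_tokens=64)
print(result)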

Source code in outlines/models/llamacpp.py
def generate(
    self,
    model_input: Union[Chat, str],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **inference_kwargs: Any,
) -> str:
    """Generate text using `llama-cpp-python`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    **inference_kwargs
        Additional keyword arguments to pass to the `Llama.__call__`
        method of the `llama-cpp-python` library.

    Returns
    -------
    str
        The text generated by the model.

    """
    prompt = self.type_adapter.format_input(model_input)

    if isinstance(prompt, str):
        completion = self.model(
            prompt,
            logits_processor=self.type_adapter.format_output_type(output_type),
            **inference_kwargs,
        )
        result = completion["choices"][0]["text"]
    elif isinstance(prompt, list):
        completion = self.model.create_chat_completion(
            prompt,
            logits_processor=self.type_adapter.format_output_type(output_type),
            **inference_kwargs,
        )
        result = completion["choices"][0]["message"]["content"]
    else:  # Never reached  # pragma: no cover
        raise ValueError("Unexpected prompt type.")

    self.model.reset()

    return result

generate_stream(model_input, output_type=None, **inference_kwargs)

Stream text using llama-cpp-python.

Parameters:

model_input (Union[Chat, str]), required
    The prompt based on which the model will generate a response.

output_type (Optional[OutlinesLogitsProcessor]), default: None
    The logits processor the model will use to constrain the format of the generated text.

**inference_kwargs (Any)
    Additional keyword arguments to pass to the Llama.__call__ method of the llama-cpp-python library.

Returns:

Iterator[str]
    An iterator that yields the text generated by the model.
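
For example, a minimal sketch assuming a LlamaCpp instance named model:

# Chunks are printed as soon as llama-cpp-python produces them.
for chunk in model.generate_stream("Tell me a short story.", max_tokens=128):
    print(chunk, end="", flush=True)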

Source code in outlines/models/llamacpp.py
def generate_stream(
    self,
    model_input: Union[Chat, str],
    output_type: Optional[OutlinesLogitsProcessor] = None,
    **inference_kwargs: Any,
) -> Iterator[str]:
    """Stream text using `llama-cpp-python`.

    Parameters
    ----------
    model_input
        The prompt based on which the model will generate a response.
    output_type
        The logits processor the model will use to constrain the format of
        the generated text.
    **inference_kwargs
        Additional keyword arguments to pass to the `Llama.__call__`
        method of the `llama-cpp-python` library.

    Returns
    -------
    Iterator[str]
        An iterator that yields the text generated by the model.

    """
    prompt = self.type_adapter.format_input(model_input)

    if isinstance(prompt, str):
        generator = self.model(
            prompt,
            logits_processor=self.type_adapter.format_output_type(output_type),
            stream=True,
            **inference_kwargs,
        )
        for chunk in generator:
            yield chunk["choices"][0]["text"]

    elif isinstance(prompt, list):
        generator = self.model.create_chat_completion(
            prompt,
            logits_processor=self.type_adapter.format_output_type(output_type),
            stream=True,
            **inference_kwargs,
        )
        for chunk in generator:
            yield chunk["choices"][0]["delta"].get("content", "")
    else:  # Never reached  # pragma: no cover
        raise ValueError("Unexpected prompt type.")

LlamaCppTokenizer

Bases: Tokenizer

Source code in outlines/models/llamacpp.py
class LlamaCppTokenizer(Tokenizer):
    def __init__(self, model: "Llama"):
        self.tokenizer = model.tokenizer()
        self.special_tokens: Set[str] = set()
        self.vocabulary: Dict[str, int] = dict()

        # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613
        # is resolved
        self._hf_tokenizer = None
        if (
            hasattr(model, "tokenizer_")
            and hasattr(model.tokenizer_, "hf_tokenizer")
        ):
            self._hf_tokenizer = model.tokenizer_.hf_tokenizer
            self.eos_token_id = self._hf_tokenizer.eos_token_id
            self.eos_token = self._hf_tokenizer.eos_token
            self.vocabulary = self._hf_tokenizer.get_vocab()
        else:
            from llama_cpp import (
                llama_model_get_vocab,
                llama_token_to_piece,
            )

            self.eos_token_id = model.token_eos()
            size = 32
            buffer = (ctypes.c_char * size)()
            for i in range(model.n_vocab()):
                n = llama_token_to_piece(
                    llama_model_get_vocab(model.model),
                    i,
                    buffer,
                    size,
                    0,
                    True
                )
                token_piece = buffer[:n].decode("utf-8", errors="replace") # type: ignore
                self.vocabulary[token_piece] = i
                if i == self.eos_token_id:
                    self.eos_token = token_piece

        self.pad_token_id = self.eos_token_id
        # ensure stable ordering of vocabulary
        self.vocabulary = {
            tok: tok_id
            for tok, tok_id
            in sorted(self.vocabulary.items(), key=lambda x: x[1])
        }
        self._hash = None

    def decode(self, token_ids: List[int]) -> List[str]:
        decoded_bytes = self.tokenizer.detokenize(token_ids)
        return [decoded_bytes.decode("utf-8", errors="ignore")]

    def encode(
        self,
        prompt: Union[str, List[str]],
        add_bos: bool = True,
        special: bool = True,
    ) -> Tuple[List[int], List[int]]:
        if isinstance(prompt, list):
            raise NotImplementedError(
                "llama-cpp-python tokenizer doesn't support batch tokenization"
            )
        token_ids = self.tokenizer.tokenize(
            prompt.encode("utf-8", errors="ignore"),
            add_bos=add_bos,
            special=special,
        )
        # generate attention mask, missing from llama-cpp-python
        attention_mask = [
            1 if token_id != self.pad_token_id else 0 for token_id in token_ids
        ]
        return token_ids, attention_mask

    def convert_token_to_string(self, token: str) -> str:
        if self._hf_tokenizer is not None:
            from transformers.file_utils import SPIECE_UNDERLINE

            token_str = self._hf_tokenizer.convert_tokens_to_string([token])
            if (
                token.startswith(SPIECE_UNDERLINE)
                or token == "<0x20>"
            ):  # pragma: no cover
                token_str = " " + token_str
            return token_str
        else:
            return token

    def __eq__(self, other):
        if not isinstance(other, LlamaCppTokenizer):
            return False
        return self.__getstate__() == other.__getstate__()

    def __hash__(self):
        # We create a custom hash as pickle.dumps(self) is not stable
        if self._hash is None:
            self._hash = hash((
                tuple(sorted(self.vocabulary.items())),
                self.eos_token_id,
                self.eos_token,
                self.pad_token_id,
                tuple(sorted(self.special_tokens)),
            ))
        return self._hash

    def __getstate__(self):
        """Create a stable representation for outlines.caching"""
        return (
            self.vocabulary,
            self.eos_token_id,
            self.eos_token,
            self.pad_token_id,
            sorted(self.special_tokens),
        )

    def __setstate__(self, state):
        raise NotImplementedError("Cannot load a pickled llamacpp tokenizer")
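
For example, a minimal sketch assuming a llama_cpp.Llama instance named llm:

from outlines.models.llamacpp import LlamaCppTokenizer

tokenizer = LlamaCppTokenizer(llm)

# encode returns the token ids together with an attention mask, which
# llama-cpp-python itself does not provide.
token_ids, attention_mask = tokenizer.encode("Hello world")

# decode returns a single-element list containing the detokenized string.
print(tokenizer.decode(token_ids))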

__getstate__()

Create a stable representation for outlines.caching

Source code in outlines/models/llamacpp.py
def __getstate__(self):
    """Create a stable representation for outlines.caching"""
    return (
        self.vocabulary,
        self.eos_token_id,
        self.eos_token,
        self.pad_token_id,
        sorted(self.special_tokens),
    )

LlamaCppTypeAdapter

Bases: ModelTypeAdapter

Type adapter for the LlamaCpp model.

LlamaCppTypeAdapter is responsible for preparing the arguments passed to the text generation methods of the Llama object.
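
For example, a minimal sketch of the dispatch on input type:

from outlines.models.llamacpp import LlamaCppTypeAdapter

adapter = LlamaCppTypeAdapter(has_chat_template=True)
print(adapter.format_input("hi"))
# [{'role': 'user', 'content': 'hi'}]

adapter = LlamaCppTypeAdapter(has_chat_template=False)
print(adapter.format_input("hi"))
# hi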

Source code in outlines/models/llamacpp.py
class LlamaCppTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `LlamaCpp` model.

    `LlamaCppTypeAdapter` is responsible for preparing the arguments passed
    to the text generation methods of the `Llama` object.

    """

    def __init__(self, has_chat_template: bool = False):
        """
        Parameters
        ----------
        has_chat_template
            Whether the model has a chat template defined.
        """
        self.has_chat_template = has_chat_template

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the model.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        str | list
            The formatted input to be passed to the model.

        """
        raise NotImplementedError(
            f"The input type {type(model_input)} is not available with "
            "LlamaCpp. The only available types are `str` and `Chat`."
        )

    @format_input.register(str)
    def format_str_input(self, model_input: str) -> str | list:
        if self.has_chat_template:
            return [{"role": "user", "content": model_input}]
        return model_input

    @format_input.register(Chat)
    def format_chat_input(self, model_input: Chat) -> list:
        if not all(
            isinstance(message["content"], str)
            for message in model_input.messages
        ):
            raise ValueError(
                "LlamaCpp does not support multi-modal messages. "
                "The content of each message must be a string."
            )

        return [
            {
                "role": message["role"],
                "content": message["content"],
            }
            for message in model_input.messages
        ]

    def format_output_type(
        self, output_type: Optional[OutlinesLogitsProcessor] = None,
    ) -> Optional["LogitsProcessorList"]:
        """Generate the logits processor argument to pass to the model.

        Parameters
        ----------
        output_type
            The logits processor provided.

        Returns
        -------
        Optional[LogitsProcessorList]
            The logits processor to pass to the model, or None if no output
            type is provided.

        """
        from llama_cpp import LogitsProcessorList

        if output_type is not None:
            return LogitsProcessorList([output_type])
        return None

__init__(has_chat_template=False)

Parameters:

has_chat_template (bool), default: False
    Whether the model has a chat template defined.
Source code in outlines/models/llamacpp.py
def __init__(self, has_chat_template: bool = False):
    """
    Parameters
    ----------
    has_chat_template
        Whether the model has a chat template defined.
    """
    self.has_chat_template = has_chat_template

format_input(model_input)

Generate the prompt argument to pass to the model.

Parameters:

model_input, required
    The input provided by the user.

Returns:

str | list
    The formatted input to be passed to the model.

Source code in outlines/models/llamacpp.py
@singledispatchmethod
def format_input(self, model_input):
    """Generate the prompt argument to pass to the model.

    Parameters
    ----------
    model_input
        The input provided by the user.

    Returns
    -------
    str | list
        The formatted input to be passed to the model.

    """
    raise NotImplementedError(
        f"The input type {type(model_input)} is not available with "
        "LlamaCpp. The only available types are `str` and `Chat`."
    )

format_output_type(output_type=None)

Generate the logits processor argument to pass to the model.

Parameters:

output_type (Optional[OutlinesLogitsProcessor]), default: None
    The logits processor provided.

Returns:

Optional[LogitsProcessorList]
    The logits processor to pass to the model, or None if no output type is provided.
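
For example, a minimal sketch (None is passed through unchanged; any OutlinesLogitsProcessor would be wrapped in a llama_cpp.LogitsProcessorList):

from outlines.models.llamacpp import LlamaCppTypeAdapter

adapter = LlamaCppTypeAdapter()
assert adapter.format_output_type(None) is None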

Source code in outlines/models/llamacpp.py
def format_output_type(
    self, output_type: Optional[OutlinesLogitsProcessor] = None,
) -> Optional["LogitsProcessorList"]:
    """Generate the logits processor argument to pass to the model.

    Parameters
    ----------
    output_type
        The logits processor provided.

    Returns
    -------
    Optional[LogitsProcessorList]
        The logits processor to pass to the model, or None if no output type
        is provided.

    """
    from llama_cpp import LogitsProcessorList

    if output_type is not None:
        return LogitsProcessorList([output_type])
    return None

from_llamacpp(model, chat_mode=True)

Create an Outlines LlamaCpp model instance from a llama_cpp.Llama instance.

Parameters:

model (Llama), required
    A llama_cpp.Llama instance.

chat_mode (bool), default: True
    Whether to enable chat mode. If False, the model will regard all str inputs as plain text prompts. If True, the model will regard all str inputs as user messages in a chat conversation.

Returns:

LlamaCpp
    An Outlines LlamaCpp model instance.
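
For example, a minimal sketch contrasting the two chat_mode settings (the GGUF path is a placeholder):

from llama_cpp import Llama

from outlines.models.llamacpp import from_llamacpp

llm = Llama("model.gguf")

chat_model = from_llamacpp(llm)                   # str inputs become user messages
text_model = from_llamacpp(llm, chat_mode=False)  # str inputs are plain prompts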

Source code in outlines/models/llamacpp.py
def from_llamacpp(model: "Llama", chat_mode: bool = True) -> LlamaCpp:
    """Create an Outlines `LlamaCpp` model instance from a
    `llama_cpp.Llama` instance.

    Parameters
    ----------
    model
        A `llama_cpp.Llama` instance.
    chat_mode
        Whether to enable chat mode. If `False`, the model will regard
        all `str` inputs as plain text prompts. If `True`, the model will
        regard all `str` inputs as user messages in a chat conversation.

    Returns
    -------
    LlamaCpp
        An Outlines `LlamaCpp` model instance.

    """
    return LlamaCpp(model, chat_mode=chat_mode)