Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
- ci: Publish release wheels as `py3-none` by @Bing-su in #2166
- fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165

Expand Down
25 changes: 25 additions & 0 deletions docs/server.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@ The server can then be started by running the following command:
python3 -m llama_cpp.server --model <model_path>
```

You can also pass chat-template kwargs at model load time from the CLI:

```bash
python3 -m llama_cpp.server \
--model <model_path> \
--chat_format chatml \
--chat_template_kwargs '{"enable_thinking": true}'
```

### Server options

For a full list of options, run:
Expand Down Expand Up @@ -147,6 +156,22 @@ The server supports routing requests to multiple models based on the `model` par

At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.

For a single-model config, `chat_template_kwargs` can be set directly on the model entry:

```json
{
"models": [
{
"model": "models/Qwen3.5-0.8B/qwen3.5-0.8b-q8_0.gguf",
"chat_format": "chatml",
"chat_template_kwargs": {
"enable_thinking": true
}
}
]
}
```

```json
{
"host": "0.0.0.0",
Expand Down
7 changes: 6 additions & 1 deletion llama_cpp/llama_chat_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ def raise_exception(message: str):
tools=tools,
tool_choice=tool_choice,
strftime_now=self.strftime_now,
**kwargs,
)

stopping_criteria = None
Expand Down Expand Up @@ -617,6 +618,7 @@ def chat_completion_handler(
function_call=function_call,
tools=tools,
tool_choice=tool_choice,
**kwargs,
)
prompt = llama.tokenize(
result.prompt.encode("utf-8"),
Expand Down Expand Up @@ -734,7 +736,9 @@ def format_autotokenizer(
**kwargs: Any,
) -> ChatFormatterResponse:
tokenizer.use_default_system_prompt = False # type: ignore
prompt: str = tokenizer.apply_chat_template(messages, tokenize=False) # type: ignore
prompt: str = tokenizer.apply_chat_template( # type: ignore
messages, tokenize=False, **kwargs
)
assert isinstance(prompt, str)
# Return formatted prompt and eos token by default
return ChatFormatterResponse(
Expand Down Expand Up @@ -791,6 +795,7 @@ def format_tokenizer_config(
messages=messages,
bos_token=bos_token,
eos_token=eos_token,
**kwargs,
)
return ChatFormatterResponse(
prompt=prompt, stop=[eos_token, bos_token], added_special=True
Expand Down
34 changes: 32 additions & 2 deletions llama_cpp/server/cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from __future__ import annotations

import argparse
import json

from typing import List, Literal, Union, Any, Type, TypeVar
from typing import List, Literal, Union, Any, Type, TypeVar, Dict

from pydantic import BaseModel

Expand Down Expand Up @@ -40,6 +41,17 @@ def _contains_list_type(annotation: Type[Any] | None) -> bool:
return False


def _contains_dict_type(annotation: Type[Any] | None) -> bool:
origin = getattr(annotation, "__origin__", None)

if origin is dict or origin is Dict:
return True
elif origin in (Literal, Union):
return any(_contains_dict_type(arg) for arg in annotation.__args__) # type: ignore
else:
return False


def _parse_bool_arg(arg: str | bytes | bool) -> bool:
if isinstance(arg, bytes):
arg = arg.decode("utf-8")
Expand All @@ -57,6 +69,16 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool:
raise ValueError(f"Invalid boolean argument: {arg}")


def _parse_json_object_arg(arg: str | bytes) -> dict[str, Any]:
if isinstance(arg, bytes):
arg = arg.decode("utf-8")

value = json.loads(arg)
if not isinstance(value, dict):
raise ValueError(f"Invalid JSON object argument: {arg}")
return value


def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]):
"""Add arguments from a pydantic model to an argparse parser."""

Expand All @@ -68,7 +90,15 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel])
_get_base_type(field.annotation) if field.annotation is not None else str
)
list_type = _contains_list_type(field.annotation)
if base_type is not bool:
dict_type = _contains_dict_type(field.annotation)
if dict_type:
parser.add_argument(
f"--{name}",
dest=name,
type=_parse_json_object_arg,
help=description,
)
elif base_type is not bool:
parser.add_argument(
f"--{name}",
dest=name,
Expand Down
15 changes: 15 additions & 0 deletions llama_cpp/server/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
# Misc
verbose=settings.verbose,
)
if settings.chat_template_kwargs:
base_chat_handler = (
_model.chat_handler
or _model._chat_handlers.get(_model.chat_format)
or llama_cpp.llama_chat_format.get_chat_completion_handler(
_model.chat_format
)
)

def chat_handler_with_kwargs(*args, **kwargs):
return base_chat_handler(
*args, **{**settings.chat_template_kwargs, **kwargs}
)

_model.chat_handler = chat_handler_with_kwargs
if settings.cache:
if settings.cache_type == "disk":
if settings.verbose:
Expand Down
6 changes: 5 additions & 1 deletion llama_cpp/server/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import multiprocessing

from typing import Optional, List, Literal, Union, Dict, cast
from typing import Any, Optional, List, Literal, Union, Dict, cast
from typing_extensions import Self

from pydantic import Field, model_validator
Expand Down Expand Up @@ -131,6 +131,10 @@ class ModelSettings(BaseSettings):
default=None,
description="Chat format to use.",
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
default=None,
description="Extra keyword arguments forwarded to chat templates at model load time. Matches llama.cpp server `chat_template_kwargs`.",
)
clip_model_path: Optional[str] = Field(
default=None,
description="Path to a CLIP model to use for multi-modal chat completion.",
Expand Down
Loading