Skip to content

Commit 7828382

Browse files
committed
Add KV cache quantization support (type_k / type_v settings) to the server
1 parent fcb8051 commit 7828382

File tree

2 files changed

+12
-0
lines changed

2 files changed

+12
-0
lines changed

llama_cpp/server/model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
175175
chat_handler=chat_handler,
176176
# Speculative Decoding
177177
draft_model=draft_model,
178+
# KV Cache Quantization
179+
type_k=settings.type_k,
180+
type_v=settings.type_v,
178181
# Tokenizer
179182
tokenizer=tokenizer,
180183
# Misc

llama_cpp/server/settings.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
159159
default=10,
160160
description="Number of tokens to predict using the draft model.",
161161
)
162+
# KV Cache Quantization
163+
type_k: Optional[int] = Field(
164+
default=None,
165+
description="Type of the key cache quantization.",
166+
)
167+
type_v: Optional[int] = Field(
168+
default=None,
169+
description="Type of the value cache quantization.",
170+
)
162171
# Misc
163172
verbose: bool = Field(
164173
default=True, description="Whether to print debug information."

0 commit comments

Comments
 (0)