2 files changed: +12 -0 lines changed
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
175
175
chat_handler = chat_handler ,
176
176
# Speculative Decoding
177
177
draft_model = draft_model ,
178
+ # KV Cache Quantization
179
+ type_k = settings .type_k ,
180
+ type_v = settings .type_v ,
178
181
# Tokenizer
179
182
tokenizer = tokenizer ,
180
183
# Misc
Original file line number | Diff line number | Diff line change
@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
159
159
default = 10 ,
160
160
description = "Number of tokens to predict using the draft model." ,
161
161
)
162
+ # KV Cache Quantization
163
+ type_k : Optional [int ] = Field (
164
+ default = None ,
165
+ description = "Type of the key cache quantization." ,
166
+ )
167
+ type_v : Optional [int ] = Field (
168
+ default = None ,
169
+ description = "Type of the value cache quantization." ,
170
+ )
162
171
# Misc
163
172
verbose : bool = Field (
164
173
default = True , description = "Whether to print debug information."
You can’t perform that action at this time.
0 commit comments