Skip to content

Commit 7828382

Browse files
committed
Add KV cache quantization support (type_k / type_v settings) to the server
1 parent fcb8051 commit 7828382

File tree

2 files changed

+12
-0
lines changed

2 files changed

+12
-0
lines changed

llama_cpp/server/model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
175175
chat_handler=chat_handler,
176176
# Speculative Decoding
177177
draft_model=draft_model,
178+
# KV Cache Quantization
179+
type_k=settings.type_k,
180+
type_v=settings.type_v,
178181
# Tokenizer
179182
tokenizer=tokenizer,
180183
# Misc

llama_cpp/server/settings.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
159159
default=10,
160160
description="Number of tokens to predict using the draft model.",
161161
)
162+
# KV Cache Quantization
163+
type_k: Optional[int] = Field(
164+
default=None,
165+
description="Type of the key cache quantization.",
166+
)
167+
type_v: Optional[int] = Field(
168+
default=None,
169+
description="Type of the value cache quantization.",
170+
)
162171
# Misc
163172
verbose: bool = Field(
164173
default=True, description="Whether to print debug information."

0 commit comments

Comments
 (0)