
Commit 3d0a079

Merge branch 'abetlen:main' into main
Parents: e200577 + 4b11fa8 — commit 3d0a079

File tree: 10 files changed, +163 −24 lines


CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.29]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@4483396751c79dea540808b9cb9238245d06da2b
+- feat: Add split_mode option by @abetlen in 84615adbc6855c8384807c42f0130f9a1763f99d
+- feat: Implement GGUF metadata KV overrides by @phiharri in #1011
+- fix: Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor by @yieldthought in #1012
+- fix: Fix low_level_api_chat_cpp example to match current API by @aniljava in #1086
+- fix: Fix Pydantic model parsing by @DeNeutoy in #1087
+
 ## [0.2.28]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.28"
+__version__ = "0.2.29"

llama_cpp/_utils.py

Lines changed: 9 additions & 11 deletions
@@ -1,11 +1,15 @@
 import os
 import sys
 
+import sys, traceback
+
+# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
+outnull_file = open(os.devnull, "w")
+errnull_file = open(os.devnull, "w")
 
 class suppress_stdout_stderr(object):
     # NOTE: these must be "saved" here to avoid exceptions when using
     # this context manager inside of a __del__ method
-    open = open
     sys = sys
     os = os
 
@@ -21,9 +25,6 @@ def __enter__(self):
         if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
             return self  # Return the instance without making changes
 
-        self.outnull_file = self.open(self.os.devnull, "w")
-        self.errnull_file = self.open(self.os.devnull, "w")
-
         self.old_stdout_fileno_undup = self.sys.stdout.fileno()
         self.old_stderr_fileno_undup = self.sys.stderr.fileno()
 
@@ -33,11 +34,11 @@ def __enter__(self):
         self.old_stdout = self.sys.stdout
         self.old_stderr = self.sys.stderr
 
-        self.os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
-        self.os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+        self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup)
+        self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup)
 
-        self.sys.stdout = self.outnull_file
-        self.sys.stderr = self.errnull_file
+        self.sys.stdout = outnull_file
+        self.sys.stderr = errnull_file
         return self
 
     def __exit__(self, *_):
@@ -54,6 +55,3 @@ def __exit__(self, *_):
 
         self.os.close(self.old_stdout_fileno)
         self.os.close(self.old_stderr_fileno)
-
-        self.outnull_file.close()
-        self.errnull_file.close()
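
The point of this change is that the two os.devnull handles are now opened once at import time, so the context manager never calls open() from inside a __del__ that runs during interpreter shutdown. A minimal sketch of the pattern this protects (the wrapper class below is illustrative, not part of the library):

import llama_cpp._utils as _utils

class NoisyNativeObject:
    """Hypothetical object whose cleanup would otherwise print to stderr."""

    def __del__(self):
        # Safe even at interpreter shutdown: the devnull files were opened
        # when llama_cpp._utils was imported, so no open() call happens here.
        with _utils.suppress_stdout_stderr():
            pass  # release native resources that write to stdout/stderr

obj = NoisyNativeObject()
del obj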

llama_cpp/llama.py

Lines changed: 38 additions & 1 deletion
@@ -730,11 +730,13 @@ def __init__(
         *,
         # Model Params
         n_gpu_layers: int = 0,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
+        kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
         # Context Params
         seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
@@ -798,11 +800,13 @@ def __init__(
        Args:
            model_path: Path to the model.
            n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-           main_gpu: The GPU that is used for scratch and small tensors.
+           split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+           main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
            tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
            vocab_only: Only load the vocabulary no weights.
            use_mmap: Use mmap if possible.
            use_mlock: Force the system to keep the model in RAM.
+           kv_overrides: Key-value overrides for the model.
            seed: RNG seed, -1 for random
            n_ctx: Text context, 0 = from model
            n_batch: Prompt processing maximum batch size
@@ -848,6 +852,7 @@ def __init__(
        self.model_params.n_gpu_layers = (
            0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
        )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+       self.model_params.split_mode = split_mode
        self.model_params.main_gpu = main_gpu
        self.tensor_split = tensor_split
        self._c_tensor_split = None
@@ -866,6 +871,34 @@ def __init__(
        self.model_params.use_mmap = use_mmap if lora_path is None else False
        self.model_params.use_mlock = use_mlock
 
+       self.kv_overrides = kv_overrides
+       if kv_overrides is not None:
+           n_overrides = len(kv_overrides)
+           self._kv_overrides_array = (llama_cpp.llama_model_kv_override * (n_overrides + 1))()
+           self._kv_overrides_array_keys = []
+
+           for i, (k, v) in enumerate(kv_overrides.items()):
+               key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
+               self._kv_overrides_array_keys.append(key_buf)
+               self._kv_overrides_array[i].key = key_buf
+               if isinstance(v, int):
+                   self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+                   self._kv_overrides_array[i].value.int_value = v
+               elif isinstance(v, float):
+                   self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+                   self._kv_overrides_array[i].value.float_value = v
+               elif isinstance(v, bool):
+                   self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                   self._kv_overrides_array[i].value.bool_value = v
+               else:
+                   raise ValueError(f"Unknown value type for {k}: {v}")
+
+           self._kv_overrides_array_sentinel_key = b'\0'
+
+           # null array sentinel
+           self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
+           self.model_params.kv_overrides = self._kv_overrides_array
+
        self.n_batch = min(n_ctx, n_batch)  # ???
        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
        self.n_threads_batch = n_threads_batch or max(
@@ -2143,11 +2176,13 @@ def __getstate__(self):
            model_path=self.model_path,
            # Model Params
            n_gpu_layers=self.model_params.n_gpu_layers,
+           split_mode=self.model_params.split_mode,
            main_gpu=self.model_params.main_gpu,
            tensor_split=self.tensor_split,
            vocab_only=self.model_params.vocab_only,
            use_mmap=self.model_params.use_mmap,
            use_mlock=self.model_params.use_mlock,
+           kv_overrides=self.kv_overrides,
            # Context Params
            seed=self.context_params.seed,
            n_ctx=self.context_params.n_ctx,
@@ -2185,11 +2220,13 @@ def __setstate__(self, state):
            model_path=state["model_path"],
            # Model Params
            n_gpu_layers=state["n_gpu_layers"],
+           split_mode=state["split_mode"],
            main_gpu=state["main_gpu"],
            tensor_split=state["tensor_split"],
            vocab_only=state["vocab_only"],
            use_mmap=state["use_mmap"],
            use_mlock=state["use_mlock"],
+           kv_overrides=state["kv_overrides"],
            # Context Params
            seed=state["seed"],
            n_ctx=state["n_ctx"],
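
Taken together, the new constructor arguments are used like this; a minimal usage sketch, assuming a local GGUF model at a placeholder path and purely illustrative override keys:

import llama_cpp
from llama_cpp import Llama

llm = Llama(
    model_path="./models/example-7b.Q4_K_M.gguf",  # placeholder path
    n_gpu_layers=-1,                               # offload all layers
    split_mode=llama_cpp.LLAMA_SPLIT_LAYER,        # new in 0.2.29: how to split across GPUs
    main_gpu=0,                                    # ignored for LLAMA_SPLIT_LAYER
    kv_overrides={                                 # new in 0.2.29: GGUF metadata overrides
        "llama.context_length": 4096,              # int override (illustrative key)
        "tokenizer.ggml.add_bos_token": True,      # bool override (illustrative key)
    },
)

Because __getstate__ and __setstate__ now carry split_mode and kv_overrides, a pickled Llama instance round-trips these settings as well.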

llama_cpp/llama_cpp.py

Lines changed: 39 additions & 8 deletions
@@ -229,6 +229,7 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_SPLIT_LAYER = 1
 LLAMA_SPLIT_ROW = 2
 
+
 # typedef struct llama_token_data {
 #     llama_token id; // token id
 #     float logit;    // log-odds of the token
@@ -395,6 +396,7 @@ class llama_model_kv_override(Structure):
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap;   // use mmap if possible
@@ -407,7 +409,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
-        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES 
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -526,6 +528,7 @@ class llama_context_params(Structure):
 #     bool quantize_output_tensor; // quantize output.weight
 #     bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 #     bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+#     void * imatrix;              // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     """Parameters for llama_model_quantize
@@ -537,6 +540,7 @@ class llama_model_quantize_params(Structure):
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """
 
     _fields_ = [
@@ -545,6 +549,8 @@ class llama_model_quantize_params(Structure):
         ("allow_requantize", c_bool),
         ("quantize_output_tensor", c_bool),
         ("only_copy", c_bool),
+        ("pure", c_bool),
+        ("imatrix", c_void_p),
     ]
 
 
@@ -1956,14 +1962,39 @@ def llama_sample_repetition_penalties(
 
 
 # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-# LLAMA_API void llama_sample_classifier_free_guidance(
-#           struct llama_context * ctx,
+# /// @param logits Logits extracted from the original generation context.
+# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# LLAMA_API void llama_sample_apply_guidance(
+#           struct llama_context * ctx,
+#                  float * logits,
+#                  float * logits_guidance,
+#                  float   scale);
+def llama_sample_apply_guidance(
+    ctx: llama_context_p,
+    logits,  # type: _Pointer[c_float]
+    logits_guidance,  # type: _Pointer[c_float]
+    scale: Union[c_float, float],
+):
+    """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
+    return _lib.llama_sample_apply_guidance(ctx, logits, logits_guidance, scale)
+
+
+_lib.llama_sample_apply_guidance.argtypes = [
+    llama_context_p,
+    c_float_p,
+    c_float_p,
+    c_float,
+]
+_lib.llama_sample_apply_guidance.restype = None
+
+
+# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
+#           struct llama_context * ctx,
 #         llama_token_data_array * candidates,
-#   struct llama_context * guidance_ctx,
-#                    float   scale);
+#   struct llama_context * guidance_ctx,
+#                    float   scale),
+#           "use llama_sample_apply_guidance() instead");
 def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
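
For orientation, a hedged sketch of driving the new low-level binding; it assumes ctx and guidance_ctx are already-evaluated llama_context_p handles from the same model and only shows the pointer plumbing:

import ctypes
import llama_cpp

def apply_cfg(ctx, guidance_ctx, scale: float = 1.5):
    """Blend guidance logits into the main context's logits in place."""
    # Both calls return float* arrays of length n_vocab, valid after decode/eval.
    logits = llama_cpp.llama_get_logits(ctx)
    logits_guidance = llama_cpp.llama_get_logits(guidance_ctx)
    llama_cpp.llama_sample_apply_guidance(
        ctx,
        logits,
        logits_guidance,
        ctypes.c_float(scale),  # 1.0 means no guidance; higher is stronger
    )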

llama_cpp/llama_grammar.py

Lines changed: 0 additions & 1 deletion
@@ -1433,7 +1433,6 @@ def _add_rule(self, name: str, rule: str):
 
     def visit(self, schema: Dict[str, Any], name: str) -> str:
         schema_type: Optional[str] = schema.get("type")  # type: ignore
-        assert isinstance(schema_type, str), f"Unrecognized schema: {schema}"
         rule_name = name or "root"
 
         if "$defs" in schema:

llama_cpp/server/model.py

Lines changed: 19 additions & 1 deletion
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Optional, Union, List
+from typing import Dict, Optional, Union, List
 
 import llama_cpp
 
@@ -71,6 +71,23 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(
             clip_model_path=settings.clip_model_path, verbose=settings.verbose
         )
+
+    kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
+    if settings.kv_overrides is not None:
+        assert isinstance(settings.kv_overrides, list)
+        kv_overrides = {}
+        for kv in settings.kv_overrides:
+            key, value = kv.split("=")
+            if ":" in value:
+                value_type, value = value.split(":")
+                if value_type == "bool":
+                    kv_overrides[key] = value.lower() in ["true", "1"]
+                elif value_type == "int":
+                    kv_overrides[key] = int(value)
+                elif value_type == "float":
+                    kv_overrides[key] = float(value)
+                else:
+                    raise ValueError(f"Unknown value type {value_type}")
 
     _model = llama_cpp.Llama(
         model_path=settings.model,
@@ -81,6 +98,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         vocab_only=settings.vocab_only,
         use_mmap=settings.use_mmap,
         use_mlock=settings.use_mlock,
+        kv_overrides=kv_overrides,
        # Context Params
        seed=settings.seed,
        n_ctx=settings.n_ctx,
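
The parsing above turns each "key=type:value" string into a typed Python value before it reaches Llama(). A small self-contained sketch of the same logic with illustrative keys:

# Mirrors the server-side parsing: "key=type:value" -> {key: typed value}.
samples = [
    "tokenizer.ggml.add_bos_token=bool:false",  # illustrative key
    "llama.context_length=int:4096",            # illustrative key
]

kv_overrides = {}
for kv in samples:
    key, value = kv.split("=")
    value_type, value = value.split(":")
    if value_type == "bool":
        kv_overrides[key] = value.lower() in ["true", "1"]
    elif value_type == "int":
        kv_overrides[key] = int(value)
    elif value_type == "float":
        kv_overrides[key] = float(value)
    else:
        raise ValueError(f"Unknown value type {value_type}")

print(kv_overrides)
# {'tokenizer.ggml.add_bos_token': False, 'llama.context_length': 4096}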

llama_cpp/server/settings.py

Lines changed: 8 additions & 0 deletions
@@ -28,6 +28,10 @@ class ModelSettings(BaseSettings):
         ge=-1,
         description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
+    split_mode: int = Field(
+        default=llama_cpp.LLAMA_SPLIT_LAYER,
+        description="The split mode to use.",
+    )
     main_gpu: int = Field(
         default=0,
         ge=0,
@@ -48,6 +52,10 @@ class ModelSettings(BaseSettings):
         default=llama_cpp.llama_mlock_supported(),
         description="Use mlock.",
     )
+    kv_overrides: Optional[List[str]] = Field(
+        default=None,
+        description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
+    )
     # Context Params
     seed: int = Field(
         default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
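
Because ModelSettings is a pydantic settings class, the new fields can also be set programmatically (or through the matching environment variables and CLI flags); a minimal sketch with a placeholder model path and an illustrative override:

import llama_cpp
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/example-7b.Q4_K_M.gguf",         # placeholder path
    split_mode=llama_cpp.LLAMA_SPLIT_LAYER,
    kv_overrides=["llama.context_length=int:4096"],  # illustrative override
)
print(settings.split_mode, settings.kv_overrides)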

tests/test_grammar.py

Lines changed: 39 additions & 0 deletions
@@ -1,13 +1,52 @@
 import llama_cpp
+import json
 
 tree = """
 leaf ::= "."
 node ::= leaf | "(" node node ")"
 root ::= node
 """
 
+
 def test_grammar_from_string():
     grammar = llama_cpp.LlamaGrammar.from_string(tree)
     assert grammar._n_rules == 3
     assert grammar._start_rule_index == 2
     assert grammar.grammar is not None
+
+
+def test_composed_pydantic_grammar():
+    """
+    from pydantic import BaseModel
+
+    class A(BaseModel):
+        a: int
+
+    class B(BaseModel):
+        a: A
+        b: int
+    """
+
+    # This schema corresponds to the grammar in the comment above.
+    # We don't use the pydantic models directly to avoid the dependency.
+    schema = {
+        "$defs": {
+            "A": {
+                "properties": {"a": {"title": "A", "type": "integer"}},
+                "required": ["a"],
+                "title": "A",
+                "type": "object",
+            }
+        },
+        "properties": {
+            "a": {"$ref": "#/$defs/A"},
+            "b": {"title": "B", "type": "integer"},
+        },
+        "required": ["a", "b"],
+        "title": "B",
+        "type": "object",
+    }
+
+    grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema))
+
+    assert grammar.grammar is not None
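
Beyond the assertion in the test, the composed grammar can constrain actual generation; a hedged sketch, assuming a local GGUF model at a placeholder path and the schema dict from the test above:

import json
import llama_cpp

grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema))

llm = llama_cpp.Llama(model_path="./models/example-7b.Q4_K_M.gguf")  # placeholder
out = llm(
    "Produce a JSON object matching the schema: ",
    grammar=grammar,   # generation is restricted to the schema's grammar
    max_tokens=64,
)
print(out["choices"][0]["text"])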

vendor/llama.cpp (submodule updated; see the llama.cpp update noted in the CHANGELOG above)
