
Commit 9652372

performant reproduction with tuning
1 parent e83c965 commit 9652372

File tree

4 files changed: +31 -7 lines changed


examples/high_level_api/high_level_api_inference.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -5,13 +5,16 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin")
+parser.add_argument("-i", "--path_idx", type=str)
+parser.add_argument("-ngl", "--n_gpu_layers", type=int)
+
 args = parser.parse_args()
 
-llm = Llama(model_path=args.model)
+llm = Llama(model_path=args.model, n_gpu_layers=args.n_gpu_layers, path_idx=args.path_idx, n_ctx=128, n_batch=1)
 
 output = llm(
     "Question: What are the names of the planets in the solar system? Answer: ",
-    max_tokens=512,
+    max_tokens=128,
     stop=["Q:", "\n"],
     echo=True,
)
```
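Run standalone, the updated example is equivalent to the following minimal script. The model and index paths are placeholders and `n_gpu_layers=32` is an arbitrary choice; only the `path_idx` keyword is new in this commit:

```python
from llama_cpp import Llama

# Placeholder paths: point these at your converted model and the
# matching MLP/GPU index file (loaded via the new path_idx keyword).
llm = Llama(
    model_path="../models/7B/ggml-models.bin",
    n_gpu_layers=32,                       # assumption: any layer count works here
    path_idx="../models/7B/model.gpuidx",  # hypothetical index file name
    n_ctx=128,
    n_batch=1,
)

output = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
    max_tokens=128,
    stop=["Q:", "\n"],
    echo=True,
)
print(output["choices"][0]["text"])
```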

llama_cpp/llama.py

Lines changed: 10 additions & 1 deletion
```diff
@@ -221,6 +221,7 @@ def __init__(
         path_model: str,
         params: llama_cpp.llama_model_params,
         verbose: bool = True,
+        path_idx: Optional[str] = None,
     ):
         self.path_model = path_model
         self.params = params
@@ -235,6 +236,11 @@ def __init__(
         self.model = llama_cpp.llama_load_model_from_file(
             self.path_model.encode("utf-8"), self.params
         )
+        if path_idx:
+            llama_cpp.llama_model_apply_mlp_from_file(
+                self.model, path_idx.encode("utf-8"), True
+            )
+            llama_cpp.llama_model_apply_augmentation(self.model)
 
     def __del__(self):
         with suppress_stdout_stderr(disable=self.verbose):
@@ -761,6 +767,8 @@ def __init__(
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
         # Misc
         verbose: bool = True,
+        # GPU index
+        path_idx: Optional[str] = None,
         # Extra Params
         **kwargs,  # type: ignore
     ):
@@ -887,7 +895,8 @@
             raise ValueError(f"Model path does not exist: {model_path}")
 
         self._model = _LlamaModel(
-            path_model=self.model_path, params=self.model_params, verbose=self.verbose
+            path_model=self.model_path, params=self.model_params, verbose=self.verbose,
+            path_idx=path_idx,
         )
 
         self._ctx = _LlamaContext(
```
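Stripped of class scaffolding, the loading path added to `_LlamaModel.__init__` reduces to this sketch (assuming, per the reconstruction above, that the augmentation step only runs when an index file is supplied):

```python
from typing import Optional
import llama_cpp

def load_model(path_model: str, params: "llama_cpp.llama_model_params",
               path_idx: Optional[str] = None):
    # Same call sequence as _LlamaModel.__init__ after this commit.
    model = llama_cpp.llama_load_model_from_file(path_model.encode("utf-8"), params)
    if path_idx:
        # use_mmap=True mirrors the hard-coded flag in the diff.
        llama_cpp.llama_model_apply_mlp_from_file(model, path_idx.encode("utf-8"), True)
        llama_cpp.llama_model_apply_augmentation(model)
    return model
```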

llama_cpp/llama_cpp.py

Lines changed: 15 additions & 3 deletions
```diff
@@ -305,15 +305,12 @@ class llama_model_params(Structure):
     _fields_ = [
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
-        ("vram_budget_gb", c_float),
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("vocab_only", c_bool),
         ("use_mmap", c_bool),
         ("use_mlock", c_bool),
-        ("reset_gpu_index", c_bool),
-        ("disable_gpu_index", c_bool),
     ]
 
 
@@ -555,6 +552,21 @@ def llama_new_context_with_model(
 _lib.llama_new_context_with_model.restype = llama_context_p
 
 
+def llama_model_apply_mlp_from_file(
+    model: llama_model_p, path: bytes, use_mmap: Union[c_bool, bool]
+):
+    _lib.llama_model_apply_mlp_from_file(model, path, use_mmap)
+
+_lib.llama_model_apply_mlp_from_file.argtypes = [llama_model_p, c_char_p, c_bool]
+_lib.llama_model_apply_mlp_from_file.restype = None
+
+def llama_model_apply_augmentation(model: llama_model_p):
+    _lib.llama_model_apply_augmentation(model)
+
+_lib.llama_model_apply_augmentation.argtypes = [llama_model_p]
+_lib.llama_model_apply_augmentation.restype = None
+
+
 # // Frees all allocated memory
 # LLAMA_API void llama_free(struct llama_context * ctx);
 def llama_free(ctx: llama_context_p):
```
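Both wrappers follow the module's usual ctypes pattern: declare `argtypes`/`restype` on the `_lib` symbol and expose a thin Python shim. A low-level usage sketch, assuming a libllama build that actually exports these two symbols and that `llama_model_default_params()` is available as in the surrounding bindings (paths are placeholders):

```python
import llama_cpp

# Load the model through the raw bindings, bypassing the Llama wrapper.
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"../models/7B/ggml-models.bin", params)

# Apply the MLP predictor index from file (use_mmap=True), then augmentation.
llama_cpp.llama_model_apply_mlp_from_file(model, b"../models/7B/model.gpuidx", True)
llama_cpp.llama_model_apply_augmentation(model)
```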
