llama_cpp/llama.py
+3 lines changed: 3 additions & 0 deletions
@@ -71,6 +71,7 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -149,6 +150,7 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -220,6 +222,7 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
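A minimal usage sketch of the new parameter, assuming a build of llama-cpp-python compiled with RPC support; the model path and server addresses below are placeholders, not values taken from this diff:

from llama_cpp import Llama

# Hypothetical values: rpc_servers takes a comma-separated list of host:port
# backends, per the docstring added above.
llm = Llama(
    model_path="./models/model.gguf",  # placeholder path to a local GGUF model
    n_gpu_layers=-1,                   # offload all layers to the configured backends
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",
)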