Commit e8b4f32

passthru rpc_servers params
wip
1 parent: 951e39c

File tree

1 file changed: +3 −0 lines changed
llama_cpp/llama.py

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,7 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -149,6 +150,7 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -220,6 +222,7 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
+        self.model_params.rpc_servers = rpc_servers
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
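As a usage sketch (not part of the commit): the new rpc_servers parameter is a plain comma separated string of backend addresses that gets passed through to the llama.cpp model params. The model path and server addresses below are illustrative placeholders, and this assumes a llama.cpp build compiled with RPC support whose rpc-server instances are already listening on the given host:port pairs.

from llama_cpp import Llama

# Hypothetical example: offload model layers to two remote llama.cpp RPC
# backends via the new rpc_servers passthrough. The path and addresses
# here are placeholders, not values from the commit.
llm = Llama(
    model_path="./models/model-q4_0.gguf",  # placeholder model path
    n_gpu_layers=-1,  # offload all layers to the available backends
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # assumed host:port list
)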

0 commit comments
