@@ -13,12 +13,15 @@ class Llama:
     def __init__(
         self,
         model_path: str,
+        # NOTE: The following parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
         seed: int = 1337,
         f16_kv: bool = False,
         logits_all: bool = False,
         vocab_only: bool = False,
+        use_mlock: bool = False,
+        embedding: bool = False,
         n_threads: Optional[int] = None,
     ) -> "Llama":
         """Load a llama.cpp model from `model_path`.
@@ -31,6 +34,8 @@ def __init__(
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary, no weights.
+            use_mlock: Force the system to keep the model in RAM.
+            embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.

         Raises:
@@ -51,6 +56,8 @@ def __init__(
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
+        self.params.use_mlock = use_mlock
+        self.params.embedding = embedding

         self.n_threads = n_threads or multiprocessing.cpu_count()
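For context, a minimal usage sketch of the two new constructor flags follows. Only the keyword arguments shown in the diff are taken from this commit; the model path is hypothetical and the rest is an assumption about the surrounding API.

    from llama_cpp import Llama  # assumes Llama is exposed at the package top level

    # use_mlock=True asks llama.cpp to lock the model pages in memory
    # so the OS cannot swap them out; expect higher resident memory use.
    llm = Llama(
        model_path="./models/7B/ggml-model.bin",  # hypothetical path
        use_mlock=True,
    )

    # embedding=True loads the model in embedding-only mode.
    embedder = Llama(
        model_path="./models/7B/ggml-model.bin",  # hypothetical path
        embedding=True,
    )

As the third hunk shows, both flags are passed straight through to the underlying context params struct (`self.params`), so their actual effect depends on the linked llama.cpp build and host platform support for memory locking.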