@@ -764,7 +764,7 @@ class llama_context_params(ctypes.Structure):
cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
type_k (int): data type for K cache
type_v (int): data type for V cache
- logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+ logits_all (bool): the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
flash_attn (bool): whether to use flash attention
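The deprecation note above points callers at per-token logit flags on the batch rather than a context-wide logits_all. A minimal sketch of that pattern with the low-level bindings, assuming ctx and batch were already set up by the caller (llama_batch_init plus token filling, omitted here):

import llama_cpp

# Request logits per token instead of the deprecated logits_all flag:
# 1 => compute logits for this token, 0 => skip it.
for i in range(batch.n_tokens):
    batch.logits[i] = 1 if i == batch.n_tokens - 1 else 0

llama_cpp.llama_decode(ctx, batch)  # only the flagged row(s) are populated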
@@ -2453,10 +2453,10 @@ def llama_synchronize(ctx: llama_context_p, /):
"llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
)
def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
- """Token logits obtained from the last call to llama_eval()
- The logits for the last token are stored in the last row
- Logits for which llama_batch.logits[i] == 0 are undefined
- Rows: n_tokens provided with llama_batch
+ """Token logits obtained from the last call to llama_decode()
+ The logits for which llama_batch.logits[i] != 0 are stored contiguously
+ in the order they have appeared in the batch.
+ Rows: number of tokens for which llama_batch.logits[i] != 0
Cols: n_vocab

Returns:
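Read together, the updated docstring implies the read-side sketch below. rows_of_logits is a hypothetical helper, not part of the library; it assumes llama_decode() has already been called on batch in context ctx, and that n_vocab came from llama_cpp.llama_n_vocab(model):

import llama_cpp

def rows_of_logits(ctx, batch, n_vocab):
    # Hypothetical helper: collect one row of n_vocab logits for every token
    # with llama_batch.logits[i] != 0, in batch order, matching the contiguous
    # layout described in the new docstring.
    logits = llama_cpp.llama_get_logits(ctx)  # ctypes.POINTER(ctypes.c_float)
    rows = []
    row = 0
    for i in range(batch.n_tokens):
        if batch.logits[i] != 0:
            rows.append(logits[row * n_vocab : (row + 1) * n_vocab])
            row += 1
    return rows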