Commit ba3496c

Merge branch 'main' into add-functionary-support
2 parents e2d5e95 + acf50f1

6 files changed (+98, -56 lines)

llama_cpp/llama.py

Lines changed: 27 additions & 30 deletions

@@ -308,6 +308,8 @@ def __init__(
         self.tensor_split = tensor_split
         self._p_tensor_split = None
         if self.tensor_split is not None:
+            if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES:
+                raise ValueError(f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}")
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(

@@ -442,7 +444,7 @@ def eval_logits(self) -> Deque[List[float]]:
             maxlen=self._n_ctx if self.context_params.logits_all else 1,
         )

-    def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
+    def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]:
         """Tokenize a string.

         Args:

@@ -464,6 +466,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
             tokens,
             n_ctx,
             add_bos,
+            special
         )
         if n_tokens < 0:
             n_tokens = abs(n_tokens)

@@ -475,6 +478,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
                 tokens,
                 n_tokens,
                 add_bos,
+                special
             )
             if n_tokens < 0:
                 raise RuntimeError(

@@ -1228,20 +1232,6 @@ def _create_completion(
                         }
                     ],
                 }
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": "",
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": finish_reason,
-                        }
-                    ],
-                }
                 break
             returned_tokens += 1
             yield {

@@ -1260,20 +1250,20 @@ def _create_completion(
                         }
                     ],
                 }
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": "",
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": finish_reason,
-                        }
-                    ],
-                }
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": "",
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
         if self.cache:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)

@@ -1573,14 +1563,21 @@ def create_chat_completion(
             grammar=grammar,
         )

-    def __del__(self):
+    def _free_model(self):
         if hasattr(self, "model") and self.model is not None:
             llama_cpp.llama_free_model(self.model)
             self.model = None
         if hasattr(self, "ctx") and self.ctx is not None:
             llama_cpp.llama_free(self.ctx)
             self.ctx = None

+    def __del__(self):
+        if self.verbose:
+            self._free_model()
+        else:
+            with suppress_stdout_stderr():
+                self._free_model()
+
     def __getstate__(self):
         return dict(
             model_path=self.model_path,

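The tokenize change above adds a special flag. A minimal usage sketch, assuming a local GGUF file at ./models/model.gguf (the path and prompt are placeholders, not part of the commit):

    from llama_cpp import Llama

    # vocab_only loads just the tokenizer; the model path here is hypothetical.
    llm = Llama(model_path="./models/model.gguf", vocab_only=True)

    # With special=True, control tokens such as "<s>" are mapped to their
    # special token ids instead of being tokenized as plain text.
    tokens = llm.tokenize(b"<s>Hello world", add_bos=False, special=True)
    print(tokens)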
llama_cpp/llama_chat_format.py

Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ def _format_llama2(
     ret = system_message + sep
     for role, message in messages:
         if message:
-            ret += message + " "
+            ret += role + message + " "
         else:
             ret += role + " "
     return ret

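The one-line fix above makes _format_llama2 emit the role tag before non-empty messages as well, not only for the empty trailing turn. A rough sketch of the corrected loop in isolation; the role strings, separator, and system message below are illustrative placeholders:

    def format_llama2_sketch(system_message, sep, messages):
        # Mirrors the fixed loop: the role tag is now prepended in both branches.
        ret = system_message + sep
        for role, message in messages:
            if message:
                ret += role + message + " "
            else:
                ret += role + " "
        return ret

    print(format_llama2_sketch("<<SYS>>Be helpful.<</SYS>>", "\n",
                               [("[INST] ", "Hello [/INST]"), ("[ASSISTANT] ", None)]))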
llama_cpp/llama_cpp.py

Lines changed: 55 additions & 11 deletions

@@ -240,11 +240,11 @@ class llama_token_data_array(Structure):
 # typedef struct llama_batch {
 #     int32_t n_tokens;

-#     llama_token  * token;
-#     float        * embd;
-#     llama_pos    * pos;
-#     llama_seq_id * seq_id;
-#     int8_t       * logits;
+#     llama_token  *  token;
+#     float        *  embd;
+#     llama_pos    *  pos;
+#     llama_seq_id ** seq_id;
+#     int8_t       *  logits;


 # // NOTE: helpers for smooth API transition - can be deprecated in the future

@@ -262,7 +262,7 @@ class llama_batch(Structure):
         ("token", POINTER(llama_token)),
         ("embd", c_float_p),
         ("pos", POINTER(llama_pos)),
-        ("seq_id", POINTER(llama_seq_id)),
+        ("seq_id", POINTER(POINTER(llama_seq_id))),
         ("logits", POINTER(c_int8)),
         ("all_pos_0", llama_pos),
         ("all_pos_1", llama_pos),

@@ -1069,22 +1069,26 @@ def llama_batch_get_one(
 _lib.llama_batch_get_one.restype = llama_batch


-# // Allocates a batch of tokens on the heap
+# // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+# // Each token can be assigned up to n_seq_max sequence ids
 # // The batch has to be freed with llama_batch_free()
 # // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
 # // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
 # // The rest of the llama_batch members are allocated with size n_tokens
 # // All members are left uninitialized
 # LLAMA_API struct llama_batch llama_batch_init(
 #         int32_t n_tokens,
-#         int32_t embd);
+#         int32_t embd,
+#         int32_t n_seq_max);
 def llama_batch_init(
-    n_tokens: Union[c_int, int], embd: Union[c_int, int]
+    n_tokens: Union[c_int32, int],
+    embd: Union[c_int32, int],
+    n_seq_max: Union[c_int32, int],
 ) -> llama_batch:
-    return _lib.llama_batch_init(n_tokens, embd)
+    return _lib.llama_batch_init(n_tokens, embd, n_seq_max)


-_lib.llama_batch_init.argtypes = [c_int, c_int]
+_lib.llama_batch_init.argtypes = [c_int32, c_int32, c_int32]
 _lib.llama_batch_init.restype = llama_batch


@@ -1308,6 +1312,46 @@ def llama_tokenize(
 _lib.llama_tokenize.restype = c_int


+# /// @details Convert the provided text into tokens.
+# /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+# /// @return Returns the number of tokens on success, no more than n_max_tokens
+# /// @return Returns a negative number on failure - the number of tokens that would have been returned
+# /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+# ///                Does not insert a leading space.
+# LLAMA_API int llama_tokenize(
+#     const struct llama_model * model,
+#                   const char * text,
+#                          int   text_len,
+#                  llama_token * tokens,
+#                          int   n_max_tokens,
+#                         bool   add_bos,
+#                         bool   special);
+def llama_tokenize(
+    model: llama_model_p,
+    text: bytes,
+    text_len: Union[c_int, int],
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: Union[c_int, int],
+    add_bos: Union[c_bool, bool],
+    special: Union[c_bool, bool],
+) -> int:
+    return _lib.llama_tokenize(
+        model, text, text_len, tokens, n_max_tokens, add_bos, special
+    )
+
+
+_lib.llama_tokenize.argtypes = [
+    llama_model_p,
+    c_char_p,
+    c_int,
+    llama_token_p,
+    c_int,
+    c_bool,
+    c_bool,
+]
+_lib.llama_tokenize.restype = c_int
+
+
 # // Token Id -> Piece.
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.

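With the changes above, llama_batch_init takes a third n_seq_max argument and seq_id becomes a pointer-to-pointer. A minimal allocation sketch against the updated binding; the batch sizes are arbitrary examples:

    import llama_cpp

    # Room for up to 512 tokens, token-id mode (embd == 0), and at most one
    # sequence id per token; the batch must be released with llama_batch_free.
    batch = llama_cpp.llama_batch_init(512, 0, 1)
    try:
        batch.n_tokens = 0  # caller fills the token/pos/seq_id/logits slots
    finally:
        llama_cpp.llama_batch_free(batch)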
llama_cpp/server/app.py

Lines changed: 4 additions & 2 deletions

@@ -1,5 +1,6 @@
 import sys
 import json
+import traceback
 import multiprocessing
 import time
 from re import compile, Match, Pattern

@@ -47,8 +48,8 @@ class Settings(BaseSettings):
     )
     n_gpu_layers: int = Field(
         default=0,
-        ge=0,
-        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+        ge=-1,
+        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
     main_gpu: int = Field(
         default=0,

@@ -243,6 +244,7 @@ def error_message_wrapper(
 ) -> Tuple[int, ErrorResponse]:
     """Wraps error message in OpenAI style error response"""
     print(f"Exception: {str(error)}", file=sys.stderr)
+    traceback.print_exc(file=sys.stderr)
     if body is not None and isinstance(
         body,
         (

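With ge=-1, setting n_gpu_layers=-1 now passes validation and offloads all layers. A sketch of launching the server programmatically using the existing Settings and create_app helpers; the model path is a placeholder:

    import uvicorn
    from llama_cpp.server.app import create_app, Settings

    # -1 moves every layer to the GPU; the model path here is hypothetical.
    settings = Settings(model="./models/model.gguf", n_gpu_layers=-1)
    app = create_app(settings=settings)
    uvicorn.run(app, host="127.0.0.1", port=8000)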
tests/test_llama.py

Lines changed: 10 additions & 11 deletions

@@ -26,10 +26,9 @@ def test_llama_cpp_tokenization():
     assert detokenized != text


-@pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos")
 def test_llama_patch(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
+    n_vocab = llama_cpp.llama_n_vocab(llama.model)

     ## Set up mock function
     def mock_eval(*args, **kwargs):

@@ -44,7 +43,7 @@ def mock_get_logits(*args, **kwargs):
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)

     output_text = " jumps over the lazy dog."
-    output_tokens = llama.tokenize(output_text.encode("utf-8"))
+    output_tokens = llama.tokenize(output_text.encode("utf-8"), add_bos=False, special=True)
     token_eos = llama.token_eos()
     n = 0

@@ -68,9 +67,9 @@ def mock_sample(*args, **kwargs):

     ## Test streaming completion until eos
     n = 0  # reset
-    chunks = llama.create_completion(text, max_tokens=20, stream=True)
+    chunks = list(llama.create_completion(text, max_tokens=20, stream=True))
     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text
-    assert completion["choices"][0]["finish_reason"] == "stop"
+    assert chunks[-1]["choices"][0]["finish_reason"] == "stop"

     ## Test basic completion until stop sequence
     n = 0  # reset

@@ -80,23 +79,23 @@ def mock_sample(*args, **kwargs):

     ## Test streaming completion until stop sequence
     n = 0  # reset
-    chunks = llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"])
+    chunks = list(llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]))
     assert (
         "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the "
     )
-    assert completion["choices"][0]["finish_reason"] == "stop"
+    assert chunks[-1]["choices"][0]["finish_reason"] == "stop"

     ## Test basic completion until length
     n = 0  # reset
     completion = llama.create_completion(text, max_tokens=2)
-    assert completion["choices"][0]["text"] == " j"
+    assert completion["choices"][0]["text"] == " jumps"
     assert completion["choices"][0]["finish_reason"] == "length"

     ## Test streaming completion until length
     n = 0  # reset
-    chunks = llama.create_completion(text, max_tokens=2, stream=True)
-    assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j"
-    assert completion["choices"][0]["finish_reason"] == "length"
+    chunks = list(llama.create_completion(text, max_tokens=2, stream=True))
+    assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps"
+    assert chunks[-1]["choices"][0]["finish_reason"] == "length"


 def test_llama_pickle():

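The updated tests now read finish_reason from the last streamed chunk, matching the streaming cleanup in llama.py above. A small consumption sketch of that pattern; the model path and prompt are placeholders:

    from llama_cpp import Llama

    llm = Llama(model_path="./models/model.gguf")  # hypothetical path
    chunks = list(llm.create_completion("The quick brown fox", max_tokens=8, stream=True))

    # Chunks carry text deltas; the final chunk carries finish_reason.
    text = "".join(chunk["choices"][0]["text"] for chunk in chunks)
    print(text, chunks[-1]["choices"][0]["finish_reason"])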
vendor/llama.cpp
