
Commit 0538ba1

Merge branch 'main' into v0.2-wip

2 parents 0b121a7 + a4fe3fe

7 files changed: +293 -67 lines changed


CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.74]
+
+### Added
+
+- (server) OpenAI style error responses
+
 ## [0.1.73]
 
 ### Added
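
The new changelog entry mentions OpenAI-style error responses from the bundled server. The exact payload the server emits is not part of this diff; as a rough sketch, an OpenAI-compatible error body generally follows the envelope below (field values are illustrative only):

```python
# Illustrative only: the general shape of an OpenAI-style error envelope that
# a compatible server returns alongside a 4xx/5xx HTTP status. The concrete
# fields emitted by llama_cpp.server are not shown in this commit.
error_response = {
    "error": {
        "message": "Requested tokens (2048) exceed context window of 512",
        "type": "invalid_request_error",
        "param": None,
        "code": None,
    }
}
```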

README.md

Lines changed: 2 additions & 2 deletions
@@ -47,10 +47,10 @@ Otherwise, while installing it will build the llama.ccp x86 version which will b
 `llama.cpp` supports multiple BLAS backends for faster processing.
 Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.
 
-To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
+To install with OpenBLAS, set the `LLAMA_BLAS and LLAMA_BLAS_VENDOR` environment variables before installing:
 
 ```bash
-CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
 To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
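
After installing with a BLAS backend as described above, one rough way to confirm the wheel was actually built against it is to print llama.cpp's system info through the bindings. This assumes the `llama_print_system_info` wrapper is available in `llama_cpp.llama_cpp` for this version:

```python
# Minimal check, assuming the llama_print_system_info binding exists in this
# version of llama_cpp.llama_cpp; the output should report "BLAS = 1" when the
# wheel was compiled against OpenBLAS or cuBLAS.
import llama_cpp.llama_cpp as llama_cpp

print(llama_cpp.llama_print_system_info().decode("utf-8"))
```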

llama_cpp/llama.py

Lines changed: 2 additions & 2 deletions
@@ -850,7 +850,7 @@ def _create_completion(
 
         if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
             raise ValueError(
-                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
         if max_tokens <= 0:
@@ -958,7 +958,7 @@ def _create_completion(
             token_end_position += len(self.detokenize([token]))
             # Check if stop sequence is in the token
             if token_end_position >= (
-                remaining_length - first_stop_position - 1
+                remaining_length - first_stop_position
             ):
                 break
         logprobs_or_none: Optional[CompletionLogprobs] = None
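
The first hunk only changes the error text, but it makes overruns easier to diagnose because the message now includes the offending prompt length; the second hunk drops a `- 1` from the stop-sequence boundary check, which reads as an off-by-one fix in where streamed output is cut at a stop string. A hypothetical usage sketch of the improved error (the model path and token counts below are placeholders, not part of this commit):

```python
# Hypothetical sketch: with the change above, exceeding the context window now
# reports how many prompt tokens were requested, not just the window size.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=512)  # placeholder path
try:
    llm("many words " * 1000, max_tokens=16)  # tokenizes to well over 512 tokens
except ValueError as e:
    print(e)  # e.g. "Requested tokens (2001) exceed context window of 512"
```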

llama_cpp/llama_cpp.py

Lines changed: 25 additions & 1 deletion
@@ -175,6 +175,7 @@ class llama_token_data_array(Structure):
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool low_vram; // if true, reduce VRAM usage at the cost of performance
 #     bool f16_kv;   // use fp16 for KV cache
@@ -292,6 +293,15 @@ class llama_timings(Structure):
     ]
 
 
+# LLAMA_API int llama_max_devices();
+def llama_max_devices() -> int:
+    return _lib.llama_max_devices()
+
+
+_lib.llama_max_devices.argtypes = []
+_lib.llama_max_devices.restype = c_int
+
+
 # LLAMA_API struct llama_context_params llama_context_default_params();
 def llama_context_default_params() -> llama_context_params:
     return _lib.llama_context_default_params()
@@ -748,7 +758,12 @@ def llama_get_vocab(
     return _lib.llama_get_vocab(ctx, strings, scores, capacity)
 
 
-_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int]
+_lib.llama_get_vocab.argtypes = [
+    llama_context_p,
+    POINTER(c_char_p),
+    POINTER(c_float),
+    c_int,
+]
 _lib.llama_get_vocab.restype = c_int
 
 
@@ -766,6 +781,15 @@ def llama_get_vocab_from_model(
     return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
 
 
+_lib.llama_get_vocab_from_model.argtypes = [
+    llama_model_p,
+    POINTER(c_char_p),
+    POINTER(c_float),
+    c_int,
+]
+_lib.llama_get_vocab_from_model.restype = c_int
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
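
These hunks add a binding for `llama_max_devices()` and fix the `argtypes` of the vocab getters so that ctypes marshals pointer arrays instead of scalar `c_char_p`/`c_float` values. A sketch of how a caller might use the corrected signatures (context setup is omitted and the capacity is arbitrary):

```python
# Sketch under the assumption that llama_cpp.llama_cpp is importable and the
# shared library loads on import; creating a context (ctx) is omitted here.
from ctypes import c_char_p, c_float

import llama_cpp.llama_cpp as llama_cpp

# Free function, no context needed: how many devices llama.cpp was built for.
print(llama_cpp.llama_max_devices())

# With the corrected argtypes, llama_get_vocab expects arrays compatible with
# POINTER(c_char_p) / POINTER(c_float), so the caller allocates ctypes arrays:
capacity = 32000  # arbitrary upper bound for this sketch
strings = (c_char_p * capacity)()
scores = (c_float * capacity)()
# n = llama_cpp.llama_get_vocab(ctx, strings, scores, capacity)  # ctx from llama_new_context_with_model
```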
