
Commit 0538ba1

Merge branch 'main' into v0.2-wip

2 parents 0b121a7 + a4fe3fe

7 files changed: +293 -67 lines changed


CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.74]
+
+### Added
+
+- (server) OpenAI style error responses
+
 ## [0.1.73]
 
 ### Added
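
The new changelog entry mentions OpenAI-style error responses from the bundled server. The exact payload the server emits is not part of this diff; as a rough sketch, an OpenAI-compatible error body generally follows the envelope below (field values are illustrative only):

```python
# Illustrative only: the general shape of an OpenAI-style error envelope that
# a compatible server returns alongside a 4xx/5xx HTTP status. The concrete
# fields emitted by llama_cpp.server are not shown in this commit.
error_response = {
    "error": {
        "message": "Requested tokens (2048) exceed context window of 512",
        "type": "invalid_request_error",
        "param": None,
        "code": None,
    }
}
```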

README.md

Lines changed: 2 additions & 2 deletions
@@ -47,10 +47,10 @@ Otherwise, while installing it will build the llama.ccp x86 version which will b
 `llama.cpp` supports multiple BLAS backends for faster processing.
 Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.
 
-To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
+To install with OpenBLAS, set the `LLAMA_BLAS and LLAMA_BLAS_VENDOR` environment variables before installing:
 
 ```bash
-CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
 To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
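
After installing with a BLAS backend as described above, one rough way to confirm the wheel was actually built against it is to print llama.cpp's system info through the bindings. This assumes the `llama_print_system_info` wrapper is available in `llama_cpp.llama_cpp` for this version:

```python
# Minimal check, assuming the llama_print_system_info binding exists in this
# version of llama_cpp.llama_cpp; the output should report "BLAS = 1" when the
# wheel was compiled against OpenBLAS or cuBLAS.
import llama_cpp.llama_cpp as llama_cpp

print(llama_cpp.llama_print_system_info().decode("utf-8"))
```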

llama_cpp/llama.py

Lines changed: 2 additions & 2 deletions
@@ -850,7 +850,7 @@ def _create_completion(
 
         if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx):
             raise ValueError(
-                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
 
         if max_tokens <= 0:
@@ -958,7 +958,7 @@ def _create_completion(
             token_end_position += len(self.detokenize([token]))
             # Check if stop sequence is in the token
             if token_end_position >= (
-                remaining_length - first_stop_position - 1
+                remaining_length - first_stop_position
             ):
                 break
         logprobs_or_none: Optional[CompletionLogprobs] = None
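
The first hunk only changes the error text, but it makes overruns easier to diagnose because the message now includes the offending prompt length; the second hunk drops a `- 1` from the stop-sequence boundary check, which reads as an off-by-one fix in where streamed output is cut at a stop string. A hypothetical usage sketch of the improved error (the model path and token counts below are placeholders, not part of this commit):

```python
# Hypothetical sketch: with the change above, exceeding the context window now
# reports how many prompt tokens were requested, not just the window size.
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=512)  # placeholder path
try:
    llm("many words " * 1000, max_tokens=16)  # tokenizes to well over 512 tokens
except ValueError as e:
    print(e)  # e.g. "Requested tokens (2001) exceed context window of 512"
```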

llama_cpp/llama_cpp.py

Lines changed: 25 additions & 1 deletion
@@ -175,6 +175,7 @@ class llama_token_data_array(Structure):
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
 
+
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool low_vram; // if true, reduce VRAM usage at the cost of performance
 #     bool f16_kv;   // use fp16 for KV cache
@@ -292,6 +293,15 @@ class llama_timings(Structure):
     ]
 
 
+# LLAMA_API int llama_max_devices();
+def llama_max_devices() -> int:
+    return _lib.llama_max_devices()
+
+
+_lib.llama_max_devices.argtypes = []
+_lib.llama_max_devices.restype = c_int
+
+
 # LLAMA_API struct llama_context_params llama_context_default_params();
 def llama_context_default_params() -> llama_context_params:
     return _lib.llama_context_default_params()
@@ -748,7 +758,12 @@ def llama_get_vocab(
     return _lib.llama_get_vocab(ctx, strings, scores, capacity)
 
 
-_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int]
+_lib.llama_get_vocab.argtypes = [
+    llama_context_p,
+    POINTER(c_char_p),
+    POINTER(c_float),
+    c_int,
+]
 _lib.llama_get_vocab.restype = c_int
 
 
@@ -766,6 +781,15 @@ def llama_get_vocab_from_model(
     return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
 
 
+_lib.llama_get_vocab_from_model.argtypes = [
+    llama_model_p,
+    POINTER(c_char_p),
+    POINTER(c_float),
+    c_int,
+]
+_lib.llama_get_vocab_from_model.restype = c_int
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
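
These hunks add a binding for `llama_max_devices()` and fix the `argtypes` of the vocab getters so that ctypes marshals pointer arrays instead of scalar `c_char_p`/`c_float` values. A sketch of how a caller might use the corrected signatures (context setup is omitted and the capacity is arbitrary):

```python
# Sketch under the assumption that llama_cpp.llama_cpp is importable and the
# shared library loads on import; creating a context (ctx) is omitted here.
from ctypes import c_char_p, c_float

import llama_cpp.llama_cpp as llama_cpp

# Free function, no context needed: how many devices llama.cpp was built for.
print(llama_cpp.llama_max_devices())

# With the corrected argtypes, llama_get_vocab expects arrays compatible with
# POINTER(c_char_p) / POINTER(c_float), so the caller allocates ctypes arrays:
capacity = 32000  # arbitrary upper bound for this sketch
strings = (c_char_p * capacity)()
scores = (c_float * capacity)()
# n = llama_cpp.llama_get_vocab(ctx, strings, scores, capacity)  # ctx from llama_new_context_with_model
```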
