Commit 3d6c479

Merge branch 'main' into D4ve-R/main
2 parents: 950f721 + 4a85442

File tree: 3 files changed, +21 -7 lines changed

llama_cpp/llama.py

Lines changed: 13 additions & 5 deletions
@@ -1551,7 +1551,9 @@ def logit_bias_processor(
                         "utf-8", errors="ignore"
                     )
                     text_offset = len(prompt) + len(
-                        self.detokenize(completion_tokens[:returned_tokens])
+                        self.detokenize(completion_tokens[:returned_tokens]).decode(
+                            "utf-8", errors="ignore"
+                        )
                     )
                     token_offset = len(prompt_tokens) + returned_tokens
                     logits = self._scores[token_offset - 1, :]
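
This hunk fixes the text_offset computation for streamed logprobs: Llama.detokenize() returns bytes, so len() on the raw result counts UTF-8 bytes, while the offset is meant to index into decoded text. A standalone sketch (not from the patch) of the difference:

# Byte length vs. decoded character length.
raw = "héllo".encode("utf-8")  # stands in for a detokenize() result
print(len(raw))                                   # 6: "é" occupies two bytes
print(len(raw.decode("utf-8", errors="ignore")))  # 5 characters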
@@ -1789,13 +1791,19 @@ def logit_bias_processor(
             ]
             all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
             # TODO: may be able to change this loop to use np.take_along_dim
-            for token, token_str, logprobs_token in zip(
-                all_tokens, all_token_strs, all_logprobs
+            for idx, (token, token_str, logprobs_token) in enumerate(
+                zip(all_tokens, all_token_strs, all_logprobs)
             ):
                 if token == self.token_bos():
                     continue
-                text_offsets.append(text_offset)
-                text_offset += len(token_str)
+                text_offsets.append(
+                    text_offset
+                    + len(
+                        self.detokenize(all_tokens[:idx]).decode(
+                            "utf-8", errors="ignore"
+                        )
+                    )
+                )
                 tokens.append(token_str)
                 sorted_logprobs = list(
                     sorted(
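
This hunk applies the same idea to the per-token offsets: rather than accumulating len(token_str) as the loop advances, each offset is recomputed by detokenizing the whole token prefix and decoding it once. Decoding token pieces individually can silently drop bytes when a multi-byte UTF-8 character straddles a token boundary; a standalone sketch with made-up byte strings:

# "é" is 0xC3 0xA9 in UTF-8; here it is split across two hypothetical tokens.
pieces = [b"h", b"\xc3", b"\xa9"]

# Summing per-piece decoded lengths loses the split character entirely:
print(sum(len(p.decode("utf-8", errors="ignore")) for p in pieces))  # 1

# Decoding the joined prefix recovers it, keeping offsets consistent:
print(len(b"".join(pieces).decode("utf-8", errors="ignore")))        # 2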

llama_cpp/llama_cpp.py

Lines changed: 7 additions & 1 deletion
@@ -733,8 +733,14 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 
 
 _lib.llama_n_ctx.argtypes = [llama_context_p]
-_lib.llama_n_ctx.restype = c_int
+_lib.llama_n_ctx.restype = c_uint32
 
+# LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
+def llama_n_batch(ctx: llama_context_p) -> int:
+    return _lib.llama_n_batch(ctx)
+
+
+_lib.llama_n_batch.argtypes = [llama_context_p]
+_lib.llama_n_batch.restype = c_uint32
 
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 def llama_vocab_type(model: llama_model_p) -> int:
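
The new llama_n_batch binding mirrors llama_n_ctx: a thin Python wrapper plus argtypes/restype declarations on the shared-library symbol, with restype now c_uint32 to match the uint32_t return type in the C header. A minimal usage sketch, assuming a model file at a made-up path and this module's existing loader bindings:

import llama_cpp

# Hypothetical model path; the loader takes a bytes path and default params.
model = llama_cpp.llama_load_model_from_file(
    b"./models/model.gguf", llama_cpp.llama_model_default_params()
)
ctx = llama_cpp.llama_new_context_with_model(
    model, llama_cpp.llama_context_default_params()
)
print(llama_cpp.llama_n_ctx(ctx))    # context length, returned as a Python int
print(llama_cpp.llama_n_batch(ctx))  # batch size, via the binding added here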

vendor/llama.cpp
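
Lines changed: 1 addition & 1 deletion (the remainder of the commit's +21 -7 totals: the submodule commit pointer)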
