
Commit fa7f1cd

truncate to n_batch, not n_ctx

1 parent ee84ca1 · commit fa7f1cd

File tree

1 file changed: +5 −5 lines changed


llama_cpp/llama.py

Lines changed: 5 additions & 5 deletions
@@ -762,7 +762,7 @@ def embed(
         """
         assert self._ctx.ctx is not None
         n_embd = self.n_embd()
-        n_ctx = self.n_ctx()
+        n_batch = self.n_batch

         if self.context_params.embedding == False:
             raise RuntimeError(
@@ -807,19 +807,19 @@ def decode_batch(n_seq: int):
         for text in inputs:
             tokens = self.tokenize(text.encode("utf-8"))
             if truncate:
-                tokens = tokens[:n_ctx]
+                tokens = tokens[:n_batch]

             n_tokens = len(tokens)
             total_tokens += n_tokens

             # check for overrun
-            if n_tokens > n_ctx:
+            if n_tokens > n_batch:
                 raise ValueError(
-                    f"Requested tokens ({n_tokens}) exceed context window of {n_ctx}"
+                    f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}"
                 )

             # time to eval batch
-            if t_batch + n_tokens > self._n_ctx:
+            if t_batch + n_tokens > n_batch:
                 decode_batch(p_batch)
                 t_batch = 0
                 p_batch = 0
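Why the bound is n_batch rather than n_ctx: inside the embed() loop, each decode_batch() call submits at most n_batch tokens to llama_decode, so the batch size, not the full context window, is the hard limit on a single input. The sketch below is not the library's code; clamp_tokens is a hypothetical helper that only mirrors the truncate/overrun checks shown in the diff above.

    # Minimal sketch, not llama-cpp-python's API: a hypothetical helper that
    # mirrors the truncate/overrun logic this commit switches to n_batch.
    def clamp_tokens(tokens: list[int], n_batch: int, truncate: bool = True) -> list[int]:
        if truncate:
            # Cut the input to what a single batch can hold (the new behaviour).
            return tokens[:n_batch]
        # With truncate=False an oversized input is an error, now phrased in
        # terms of the batch size instead of the context window.
        if len(tokens) > n_batch:
            raise ValueError(
                f"Requested tokens ({len(tokens)}) exceed batch size of {n_batch}"
            )
        return tokens

    # Usage (hypothetical): bound each embedding input before it is queued.
    # tokens = clamp_tokens(llama.tokenize(text.encode("utf-8")), llama.n_batch)

The same reasoning applies to the flush condition at line 822: the pending batch is decoded as soon as adding the next input would push t_batch past n_batch, rather than past the context size.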

0 commit comments
