Commit ba3496c

Merge branch 'main' into add-functionary-support
2 parents e2d5e95 + acf50f1

6 files changed (+98, -56 lines)

llama_cpp/llama.py

Lines changed: 27 additions & 30 deletions

@@ -308,6 +308,8 @@ def __init__(
         self.tensor_split = tensor_split
         self._p_tensor_split = None
         if self.tensor_split is not None:
+            if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES:
+                raise ValueError(f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}")
             # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
             FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
             self._c_tensor_split = FloatArray(

@@ -442,7 +444,7 @@ def eval_logits(self) -> Deque[List[float]]:
             maxlen=self._n_ctx if self.context_params.logits_all else 1,
         )

-    def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
+    def tokenize(self, text: bytes, add_bos: bool = True, special: bool = False) -> List[int]:
         """Tokenize a string.

         Args:

@@ -464,6 +466,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
             tokens,
             n_ctx,
             add_bos,
+            special
         )
         if n_tokens < 0:
             n_tokens = abs(n_tokens)

@@ -475,6 +478,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
                 tokens,
                 n_tokens,
                 add_bos,
+                special
             )
             if n_tokens < 0:
                 raise RuntimeError(

@@ -1228,20 +1232,6 @@ def _create_completion(
                         }
                     ],
                 }
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": "",
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": finish_reason,
-                        }
-                    ],
-                }
                 break
             returned_tokens += 1
             yield {

@@ -1260,20 +1250,20 @@ def _create_completion(
                         }
                     ],
                 }
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": "",
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": finish_reason,
-                        }
-                    ],
-                }
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": "",
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
         if self.cache:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)

@@ -1573,14 +1563,21 @@ def create_chat_completion(
             grammar=grammar,
         )

-    def __del__(self):
+    def _free_model(self):
         if hasattr(self, "model") and self.model is not None:
             llama_cpp.llama_free_model(self.model)
             self.model = None
         if hasattr(self, "ctx") and self.ctx is not None:
             llama_cpp.llama_free(self.ctx)
             self.ctx = None

+    def __del__(self):
+        if self.verbose:
+            self._free_model()
+        else:
+            with suppress_stdout_stderr():
+                self._free_model()
+
     def __getstate__(self):
         return dict(
             model_path=self.model_path,

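The tokenize change above adds a special flag. A minimal usage sketch, assuming a local GGUF file at ./models/model.gguf (the path and prompt are placeholders, not part of the commit):

    from llama_cpp import Llama

    # vocab_only loads just the tokenizer; the model path here is hypothetical.
    llm = Llama(model_path="./models/model.gguf", vocab_only=True)

    # With special=True, control tokens such as "<s>" are mapped to their
    # special token ids instead of being tokenized as plain text.
    tokens = llm.tokenize(b"<s>Hello world", add_bos=False, special=True)
    print(tokens)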
llama_cpp/llama_chat_format.py

Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ def _format_llama2(
     ret = system_message + sep
     for role, message in messages:
         if message:
-            ret += message + " "
+            ret += role + message + " "
         else:
             ret += role + " "
     return ret

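The one-line fix above makes _format_llama2 emit the role tag before non-empty messages as well, not only for the empty trailing turn. A rough sketch of the corrected loop in isolation; the role strings, separator, and system message below are illustrative placeholders:

    def format_llama2_sketch(system_message, sep, messages):
        # Mirrors the fixed loop: the role tag is now prepended in both branches.
        ret = system_message + sep
        for role, message in messages:
            if message:
                ret += role + message + " "
            else:
                ret += role + " "
        return ret

    print(format_llama2_sketch("<<SYS>>Be helpful.<</SYS>>", "\n",
                               [("[INST] ", "Hello [/INST]"), ("[ASSISTANT] ", None)]))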
llama_cpp/llama_cpp.py

Lines changed: 55 additions & 11 deletions

@@ -240,11 +240,11 @@ class llama_token_data_array(Structure):
 # typedef struct llama_batch {
 #     int32_t n_tokens;

-#     llama_token  * token;
-#     float        * embd;
-#     llama_pos    * pos;
-#     llama_seq_id * seq_id;
-#     int8_t       * logits;
+#     llama_token  *  token;
+#     float        *  embd;
+#     llama_pos    *  pos;
+#     llama_seq_id ** seq_id;
+#     int8_t       *  logits;


 # // NOTE: helpers for smooth API transition - can be deprecated in the future

@@ -262,7 +262,7 @@ class llama_batch(Structure):
         ("token", POINTER(llama_token)),
         ("embd", c_float_p),
         ("pos", POINTER(llama_pos)),
-        ("seq_id", POINTER(llama_seq_id)),
+        ("seq_id", POINTER(POINTER(llama_seq_id))),
         ("logits", POINTER(c_int8)),
         ("all_pos_0", llama_pos),
         ("all_pos_1", llama_pos),

@@ -1069,22 +1069,26 @@ def llama_batch_get_one(
 _lib.llama_batch_get_one.restype = llama_batch


-# // Allocates a batch of tokens on the heap
+# // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+# // Each token can be assigned up to n_seq_max sequence ids
 # // The batch has to be freed with llama_batch_free()
 # // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
 # // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
 # // The rest of the llama_batch members are allocated with size n_tokens
 # // All members are left uninitialized
 # LLAMA_API struct llama_batch llama_batch_init(
 #         int32_t n_tokens,
-#         int32_t embd);
+#         int32_t embd,
+#         int32_t n_seq_max);
 def llama_batch_init(
-    n_tokens: Union[c_int, int], embd: Union[c_int, int]
+    n_tokens: Union[c_int32, int],
+    embd: Union[c_int32, int],
+    n_seq_max: Union[c_int32, int],
 ) -> llama_batch:
-    return _lib.llama_batch_init(n_tokens, embd)
+    return _lib.llama_batch_init(n_tokens, embd, n_seq_max)


-_lib.llama_batch_init.argtypes = [c_int, c_int]
+_lib.llama_batch_init.argtypes = [c_int32, c_int32, c_int32]
 _lib.llama_batch_init.restype = llama_batch


@@ -1308,6 +1312,46 @@ def llama_tokenize(
 _lib.llama_tokenize.restype = c_int


+# /// @details Convert the provided text into tokens.
+# /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+# /// @return Returns the number of tokens on success, no more than n_max_tokens
+# /// @return Returns a negative number on failure - the number of tokens that would have been returned
+# /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+# ///                Does not insert a leading space.
+# LLAMA_API int llama_tokenize(
+#     const struct llama_model * model,
+#                   const char * text,
+#                          int   text_len,
+#                  llama_token * tokens,
+#                          int   n_max_tokens,
+#                         bool   add_bos,
+#                         bool   special);
+def llama_tokenize(
+    model: llama_model_p,
+    text: bytes,
+    text_len: Union[c_int, int],
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: Union[c_int, int],
+    add_bos: Union[c_bool, bool],
+    special: Union[c_bool, bool],
+) -> int:
+    return _lib.llama_tokenize(
+        model, text, text_len, tokens, n_max_tokens, add_bos, special
+    )
+
+
+_lib.llama_tokenize.argtypes = [
+    llama_model_p,
+    c_char_p,
+    c_int,
+    llama_token_p,
+    c_int,
+    c_bool,
+    c_bool,
+]
+_lib.llama_tokenize.restype = c_int
+
+
 # // Token Id -> Piece.
 # // Uses the vocabulary in the provided context.
 # // Does not write null terminator to the buffer.

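With the changes above, llama_batch_init takes a third n_seq_max argument and seq_id becomes a pointer-to-pointer. A minimal allocation sketch against the updated binding; the batch sizes are arbitrary examples:

    import llama_cpp

    # Room for up to 512 tokens, token-id mode (embd == 0), and at most one
    # sequence id per token; the batch must be released with llama_batch_free.
    batch = llama_cpp.llama_batch_init(512, 0, 1)
    try:
        batch.n_tokens = 0  # caller fills the token/pos/seq_id/logits slots
    finally:
        llama_cpp.llama_batch_free(batch)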
llama_cpp/server/app.py

Lines changed: 4 additions & 2 deletions

@@ -1,5 +1,6 @@
 import sys
 import json
+import traceback
 import multiprocessing
 import time
 from re import compile, Match, Pattern

@@ -47,8 +48,8 @@ class Settings(BaseSettings):
     )
     n_gpu_layers: int = Field(
         default=0,
-        ge=0,
-        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+        ge=-1,
+        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
     )
     main_gpu: int = Field(
         default=0,

@@ -243,6 +244,7 @@ def error_message_wrapper(
 ) -> Tuple[int, ErrorResponse]:
     """Wraps error message in OpenAI style error response"""
     print(f"Exception: {str(error)}", file=sys.stderr)
+    traceback.print_exc(file=sys.stderr)
     if body is not None and isinstance(
         body,
         (

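With ge=-1, setting n_gpu_layers=-1 now passes validation and offloads all layers. A sketch of launching the server programmatically using the existing Settings and create_app helpers; the model path is a placeholder:

    import uvicorn
    from llama_cpp.server.app import create_app, Settings

    # -1 moves every layer to the GPU; the model path here is hypothetical.
    settings = Settings(model="./models/model.gguf", n_gpu_layers=-1)
    app = create_app(settings=settings)
    uvicorn.run(app, host="127.0.0.1", port=8000)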
tests/test_llama.py

Lines changed: 10 additions & 11 deletions

@@ -26,10 +26,9 @@ def test_llama_cpp_tokenization():
     assert detokenized != text


-@pytest.mark.skip(reason="bug in tokenization where leading space is always inserted even if not after eos")
 def test_llama_patch(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
+    n_vocab = llama_cpp.llama_n_vocab(llama.model)

     ## Set up mock function
     def mock_eval(*args, **kwargs):

@@ -44,7 +43,7 @@ def mock_get_logits(*args, **kwargs):
     monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits)

     output_text = " jumps over the lazy dog."
-    output_tokens = llama.tokenize(output_text.encode("utf-8"))
+    output_tokens = llama.tokenize(output_text.encode("utf-8"), add_bos=False, special=True)
     token_eos = llama.token_eos()
     n = 0

@@ -68,9 +67,9 @@ def mock_sample(*args, **kwargs):

     ## Test streaming completion until eos
     n = 0  # reset
-    chunks = llama.create_completion(text, max_tokens=20, stream=True)
+    chunks = list(llama.create_completion(text, max_tokens=20, stream=True))
     assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text
-    assert completion["choices"][0]["finish_reason"] == "stop"
+    assert chunks[-1]["choices"][0]["finish_reason"] == "stop"

     ## Test basic completion until stop sequence
     n = 0  # reset

@@ -80,23 +79,23 @@ def mock_sample(*args, **kwargs):

     ## Test streaming completion until stop sequence
     n = 0  # reset
-    chunks = llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"])
+    chunks = list(llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]))
     assert (
         "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the "
     )
-    assert completion["choices"][0]["finish_reason"] == "stop"
+    assert chunks[-1]["choices"][0]["finish_reason"] == "stop"

     ## Test basic completion until length
     n = 0  # reset
     completion = llama.create_completion(text, max_tokens=2)
-    assert completion["choices"][0]["text"] == " j"
+    assert completion["choices"][0]["text"] == " jumps"
     assert completion["choices"][0]["finish_reason"] == "length"

     ## Test streaming completion until length
     n = 0  # reset
-    chunks = llama.create_completion(text, max_tokens=2, stream=True)
-    assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j"
-    assert completion["choices"][0]["finish_reason"] == "length"
+    chunks = list(llama.create_completion(text, max_tokens=2, stream=True))
+    assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps"
+    assert chunks[-1]["choices"][0]["finish_reason"] == "length"


 def test_llama_pickle():

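The updated tests now read finish_reason from the last streamed chunk, matching the streaming cleanup in llama.py above. A small consumption sketch of that pattern; the model path and prompt are placeholders:

    from llama_cpp import Llama

    llm = Llama(model_path="./models/model.gguf")  # hypothetical path
    chunks = list(llm.create_completion("The quick brown fox", max_tokens=8, stream=True))

    # Chunks carry text deltas; the final chunk carries finish_reason.
    text = "".join(chunk["choices"][0]["text"] for chunk in chunks)
    print(text, chunks[-1]["choices"][0]["finish_reason"])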
vendor/llama.cpp
