
Commit 4ec3539

llava v1.5 integration

1 parent f30aa20

File tree

6 files changed: +149 −1 lines changed

examples/multimodal/llava.py

Lines changed: 75 additions & 0 deletions (new file)
import ctypes
import argparse
import os
import array
import sys

from llama_cpp import (Llama, clip_model_load, llava_image_embed_make_with_filename, llava_image_embed_make_with_bytes,
                       llava_image_embed_p, llava_image_embed_free, llava_validate_embed_size, llava_eval_image_embed)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf")
parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf")
parser.add_argument("-t", "--temp", type=float, default=0.1)
parser.add_argument("-p", "--prompt", type=str, default="Describe this image in detail.")
args = parser.parse_args()

print(f"loading clip model from {args.mmproj}")
if not os.path.exists(args.mmproj):
    raise FileNotFoundError(args.mmproj)
ctx_clip = clip_model_load(args.mmproj.encode('utf-8'))

image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png")
if not os.path.exists(image_path):
    raise FileNotFoundError(image_path)
image_embed = llava_image_embed_make_with_filename(ctx_clip=ctx_clip, n_threads=1, filename=image_path.encode('utf8'))

# alternative constructor: build the embed from in-memory bytes (not used below)
def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p:
    with open(image_path, 'rb') as file:
        image_bytes = file.read()
    bytes_length = len(image_bytes)
    data_array = array.array('B', image_bytes)
    c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array)
    return llava_image_embed_make_with_bytes(ctx_clip=ctx_clip, n_threads=1, image_bytes=c_ubyte_ptr, image_bytes_length=bytes_length)

print(f"loading llm model from {args.model}")
if not os.path.exists(args.model):
    raise FileNotFoundError(args.model)
llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=1)  # longer context needed for image embeds

if not llava_validate_embed_size(llm.ctx, ctx_clip):
    raise RuntimeError("llm and mmproj model embed size mismatch")

# eval system prompt
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
llm.eval(llm.tokenize(system_prompt.encode('utf8')))
llm.eval(llm.tokenize("\nUSER: ".encode('utf8')))

# eval image embed: llava_eval_image_embed advances n_past past the image tokens
n_past = ctypes.c_int(llm.n_tokens)
n_past_p = ctypes.byref(n_past)
llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p)
llm.n_tokens = n_past.value
llava_image_embed_free(image_embed)

# eval user prompt
llm.eval(llm.tokenize(args.prompt.encode('utf8')))
llm.eval(llm.tokenize("\nASSISTANT:".encode('utf8')))

# sample tokens until the end-of-sequence token
print("\n")
max_target_len = 256
for i in range(max_target_len):
    t_id = llm.sample(temp=args.temp)
    t = llm.detokenize([t_id]).decode('utf8')
    if t == "</s>":
        break
    print(t, end="")
    sys.stdout.flush()
    llm.eval([t_id])

print("\n")
print("done")
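With the argparse defaults above, the script expects the two GGUF files at those relative paths and the test image next to the script; both are checked up front and raise FileNotFoundError if missing. For example: python examples/multimodal/llava.py -m ../models/llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf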
examples/multimodal/overfitting_lc.png

5.84 KB (binary image file referenced by the example above)

llama_cpp/llama.py

Lines changed: 2 additions & 0 deletions
@@ -242,6 +242,8 @@ def __init__(
         lora_base: Optional[str] = None,
         lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Multimodal Params
+        model_mproj_path: Optional[str] = None,
         # Backend Params
         numa: bool = False,
         # Chat Format Params

llama_cpp/llama_cpp.py

Lines changed: 67 additions & 0 deletions
@@ -82,6 +82,8 @@ def _load_shared_library(lib_base_name: str):
 
 # Misc
 c_float_p = POINTER(c_float)
+c_float_p_p = POINTER(POINTER(c_float))
+c_int_p = POINTER(c_int)
 c_uint8_p = POINTER(c_uint8)
 c_size_t_p = POINTER(c_size_t)
 
@@ -112,6 +114,11 @@ def _load_shared_library(lib_base_name: str):
 # struct llama_context;
 llama_context_p = c_void_p
 
+# struct clip_ctx;
+clip_ctx_p = c_void_p
+
+# struct llava_image_embed;
+llava_image_embed_p = c_void_p
 
 # typedef int32_t llama_pos;
 llama_pos = c_int32
 
@@ -1923,3 +1930,63 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
+
+
+# LLAVA
+
+
+# LLAMA_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+def clip_model_load(fname: Union[c_char_p, bytes], verbosity: c_int = 0) -> clip_ctx_p:
+    """load mmproj model"""
+    return _lib.clip_model_load(fname, verbosity)
+_lib.clip_model_load.argtypes = [c_char_p, c_int]
+_lib.clip_model_load.restype = clip_ctx_p
+
+
+# LLAMA_API void clip_free(struct clip_ctx * ctx);
+def clip_free(ctx: clip_ctx_p):
+    """free mmproj model"""
+    _lib.clip_free(ctx)
+_lib.clip_free.argtypes = [clip_ctx_p]
+_lib.clip_free.restype = None
+
+
+# LLAMA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+def llava_validate_embed_size(ctx_llama: llama_context_p, ctx_clip: clip_ctx_p) -> c_bool:
+    """sanity check for clip <-> llava embed size match"""
+    return _lib.llava_validate_embed_size(ctx_llama, ctx_clip)
+_lib.llava_validate_embed_size.argtypes = [llama_context_p, clip_ctx_p]
+_lib.llava_validate_embed_size.restype = c_bool
+
+
+# LLAMA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[int, c_int], image_bytes: c_uint8_p, image_bytes_length: Union[int, c_size_t]) -> llava_image_embed_p:
+    """build an image embed by interpreting image_bytes as the contents of an image file with byte size image_bytes_length.
+    supported formats (autodetected): JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC (ref https://github.com/nothings/stb)"""
+    return _lib.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length)
+_lib.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, c_uint8_p, c_size_t]
+_lib.llava_image_embed_make_with_bytes.restype = llava_image_embed_p
+
+
+# LLAMA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], filename: Union[c_char_p, bytes]) -> llava_image_embed_p:
+    """build an image embed from a path to an image filename"""
+    return _lib.llava_image_embed_make_with_filename(ctx_clip, n_threads, filename)
+_lib.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p]
+_lib.llava_image_embed_make_with_filename.restype = llava_image_embed_p
+
+
+# LLAMA_API void llava_image_embed_free(struct llava_image_embed * embed);
+def llava_image_embed_free(embed: llava_image_embed_p):
+    """free an embedding made with one of the llava_image_embed_make_ methods"""
+    _lib.llava_image_embed_free(embed)
+_lib.llava_image_embed_free.argtypes = [llava_image_embed_p]
+_lib.llava_image_embed_free.restype = None
+
+
+# LLAMA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+def llava_eval_image_embed(ctx: llama_context_p, image_embed: llava_image_embed_p, n_batch: Union[int, c_int], n_past: c_int_p) -> c_bool:
+    """write the image represented by image_embed into the llama context with batch size n_batch,
+    starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed."""
+    return _lib.llava_eval_image_embed(ctx, image_embed, n_batch, n_past)
+_lib.llava_eval_image_embed.argtypes = [llama_context_p, llava_image_embed_p, c_int, c_int_p]
+_lib.llava_eval_image_embed.restype = c_bool
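The n_past argument of llava_eval_image_embed is an in/out pointer. A minimal sketch of the handshake, mirroring the example file above and assuming llm is a loaded Llama and image_embed was built with one of the constructors in this hunk:

import ctypes

n_past = ctypes.c_int(llm.n_tokens)  # current position in the context
llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, ctypes.byref(n_past))
llm.n_tokens = n_past.value  # resume generation after the image tokens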

llama_cpp/server/app.py

Lines changed: 4 additions & 0 deletions
@@ -41,6 +41,9 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    model_mproj: Optional[str] = Field(
+        default=None, description="For multimodal models (eg Llava), the path to the multimodal projector model."
+    )
     seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
     n_batch: int = Field(

@@ -345,6 +348,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        model_mproj_path=settings.model_mproj,
         seed=settings.seed,
         n_ctx=settings.n_ctx,
         n_batch=settings.n_batch,
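Since Settings is a pydantic BaseSettings, the new model_mproj field should be populated the same way as the existing fields, e.g. via a MODEL_MPROJ environment variable or the server's generated --model_mproj flag; the exact flag name is an assumption, as this diff does not show the server's argument parsing.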

vendor/llama.cpp
