
Commit 4ec3539

llava v1.5 integration

1 parent f30aa20

File tree

6 files changed: +149 −1 lines changed

examples/multimodal/llava.py

Lines changed: 75 additions & 0 deletions (new file)
import ctypes
import argparse
import os
import array
import sys

from llama_cpp import (Llama, clip_model_load, llava_image_embed_make_with_filename, llava_image_embed_make_with_bytes,
                       llava_image_embed_p, llava_image_embed_free, llava_validate_embed_size, llava_eval_image_embed)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="../models/llava-v1.5-7b/ggml-model-q5_k.gguf")
parser.add_argument("--mmproj", type=str, default="llava-v1.5-7b/mmproj-model-f16.gguf")
parser.add_argument("-t", "--temp", type=float, default=0.1)
parser.add_argument("-p", "--prompt", type=str, default="Describe this image in detail.")
args = parser.parse_args()

print(f"loading clip model from {args.mmproj}")
if not os.path.exists(args.mmproj):
    raise FileNotFoundError(args.mmproj)
ctx_clip = clip_model_load(args.mmproj.encode('utf-8'))

image_path = os.path.join(os.path.dirname(__file__), "overfitting_lc.png")
if not os.path.exists(image_path):
    raise FileNotFoundError(image_path)
image_embed = llava_image_embed_make_with_filename(ctx_clip=ctx_clip, n_threads=1, filename=image_path.encode('utf8'))

# alternative constructor: build the embed from in-memory bytes (not used below)
def load_image_embed_from_file_bytes(image_path: str) -> llava_image_embed_p:
    with open(image_path, 'rb') as file:
        image_bytes = file.read()
    bytes_length = len(image_bytes)
    data_array = array.array('B', image_bytes)
    c_ubyte_ptr = (ctypes.c_ubyte * len(data_array)).from_buffer(data_array)
    return llava_image_embed_make_with_bytes(ctx_clip=ctx_clip, n_threads=1, image_bytes=c_ubyte_ptr, image_bytes_length=bytes_length)

print(f"loading llm model from {args.model}")
if not os.path.exists(args.model):
    raise FileNotFoundError(args.model)
llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=1)  # longer context needed for image embeds

if not llava_validate_embed_size(llm.ctx, ctx_clip):
    raise RuntimeError("llm and mmproj model embed size mismatch")

# eval system prompt
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
llm.eval(llm.tokenize(system_prompt.encode('utf8')))
llm.eval(llm.tokenize("\nUSER: ".encode('utf8')))

# eval image embed: llava_eval_image_embed advances n_past past the image tokens
n_past = ctypes.c_int(llm.n_tokens)
n_past_p = ctypes.byref(n_past)
llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, n_past_p)
llm.n_tokens = n_past.value
llava_image_embed_free(image_embed)

# eval user prompt
llm.eval(llm.tokenize(args.prompt.encode('utf8')))
llm.eval(llm.tokenize("\nASSISTANT:".encode('utf8')))

# sample tokens until the end-of-sequence token
print("\n")
max_target_len = 256
for i in range(max_target_len):
    t_id = llm.sample(temp=args.temp)
    t = llm.detokenize([t_id]).decode('utf8')
    if t == "</s>":
        break
    print(t, end="")
    sys.stdout.flush()
    llm.eval([t_id])

print("\n")
print("done")
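With the argparse defaults above, the script expects the two GGUF files at those relative paths and the test image next to the script; both are checked up front and raise FileNotFoundError if missing. For example: python examples/multimodal/llava.py -m ../models/llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf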
examples/multimodal/overfitting_lc.png

5.84 KB (binary image file referenced by the example above)

llama_cpp/llama.py

Lines changed: 2 additions & 0 deletions
@@ -242,6 +242,8 @@ def __init__(
         lora_base: Optional[str] = None,
         lora_scale: float = 1.0,
         lora_path: Optional[str] = None,
+        # Multimodal Params
+        model_mproj_path: Optional[str] = None,
         # Backend Params
         numa: bool = False,
         # Chat Format Params

llama_cpp/llama_cpp.py

Lines changed: 67 additions & 0 deletions
@@ -82,6 +82,8 @@ def _load_shared_library(lib_base_name: str):
 
 # Misc
 c_float_p = POINTER(c_float)
+c_float_p_p = POINTER(POINTER(c_float))
+c_int_p = POINTER(c_int)
 c_uint8_p = POINTER(c_uint8)
 c_size_t_p = POINTER(c_size_t)
 
@@ -112,6 +114,11 @@ def _load_shared_library(lib_base_name: str):
 # struct llama_context;
 llama_context_p = c_void_p
 
+# struct clip_ctx;
+clip_ctx_p = c_void_p
+
+# struct llava_image_embed;
+llava_image_embed_p = c_void_p
 
 # typedef int32_t llama_pos;
 llama_pos = c_int32
 
@@ -1923,3 +1930,63 @@ def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p):
 
 _lib.llama_dump_timing_info_yaml.argtypes = [ctypes.c_void_p, llama_context_p]
 _lib.llama_dump_timing_info_yaml.restype = None
+
+
+# LLAVA
+
+
+# LLAMA_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+def clip_model_load(fname: Union[c_char_p, bytes], verbosity: c_int = 0) -> clip_ctx_p:
+    """load mmproj model"""
+    return _lib.clip_model_load(fname, verbosity)
+_lib.clip_model_load.argtypes = [c_char_p, c_int]
+_lib.clip_model_load.restype = clip_ctx_p
+
+
+# LLAMA_API void clip_free(struct clip_ctx * ctx);
+def clip_free(ctx: clip_ctx_p):
+    """free mmproj model"""
+    _lib.clip_free(ctx)
+_lib.clip_free.argtypes = [clip_ctx_p]
+_lib.clip_free.restype = None
+
+
+# LLAMA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+def llava_validate_embed_size(ctx_llama: llama_context_p, ctx_clip: clip_ctx_p) -> c_bool:
+    """sanity check for clip <-> llava embed size match"""
+    return _lib.llava_validate_embed_size(ctx_llama, ctx_clip)
+_lib.llava_validate_embed_size.argtypes = [llama_context_p, clip_ctx_p]
+_lib.llava_validate_embed_size.restype = c_bool
+
+
+# LLAMA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+def llava_image_embed_make_with_bytes(ctx_clip: clip_ctx_p, n_threads: Union[int, c_int], image_bytes: c_uint8_p, image_bytes_length: Union[int, c_size_t]) -> llava_image_embed_p:
+    """build an image embed by interpreting image_bytes as the contents of an image file with byte size image_bytes_length.
+    supported formats (autodetected): JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC (ref https://github.com/nothings/stb)"""
+    return _lib.llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length)
+_lib.llava_image_embed_make_with_bytes.argtypes = [clip_ctx_p, c_int, c_uint8_p, c_size_t]
+_lib.llava_image_embed_make_with_bytes.restype = llava_image_embed_p
+
+
+# LLAMA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+def llava_image_embed_make_with_filename(ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], filename: Union[c_char_p, bytes]) -> llava_image_embed_p:
+    """build an image embed from a path to an image filename"""
+    return _lib.llava_image_embed_make_with_filename(ctx_clip, n_threads, filename)
+_lib.llava_image_embed_make_with_filename.argtypes = [clip_ctx_p, c_int, c_char_p]
+_lib.llava_image_embed_make_with_filename.restype = llava_image_embed_p
+
+
+# LLAMA_API void llava_image_embed_free(struct llava_image_embed * embed);
+def llava_image_embed_free(embed: llava_image_embed_p):
+    """free an embedding made with one of the llava_image_embed_make_ methods"""
+    _lib.llava_image_embed_free(embed)
+_lib.llava_image_embed_free.argtypes = [llava_image_embed_p]
+_lib.llava_image_embed_free.restype = None
+
+
+# LLAMA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+def llava_eval_image_embed(ctx: llama_context_p, image_embed: llava_image_embed_p, n_batch: Union[int, c_int], n_past: c_int_p) -> c_bool:
+    """write the image represented by image_embed into the llama context with batch size n_batch,
+    starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed."""
+    return _lib.llava_eval_image_embed(ctx, image_embed, n_batch, n_past)
+_lib.llava_eval_image_embed.argtypes = [llama_context_p, llava_image_embed_p, c_int, c_int_p]
+_lib.llava_eval_image_embed.restype = c_bool
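The n_past argument of llava_eval_image_embed is an in/out pointer. A minimal sketch of the handshake, mirroring the example file above and assuming llm is a loaded Llama and image_embed was built with one of the constructors in this hunk:

import ctypes

n_past = ctypes.c_int(llm.n_tokens)  # current position in the context
llava_eval_image_embed(llm.ctx, image_embed, llm.n_batch, ctypes.byref(n_past))
llm.n_tokens = n_past.value  # resume generation after the image tokens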

llama_cpp/server/app.py

Lines changed: 4 additions & 0 deletions
@@ -41,6 +41,9 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    model_mproj: Optional[str] = Field(
+        default=None, description="For multimodal models (eg Llava), the path to the multimodal projector model."
+    )
     seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
     n_batch: int = Field(

@@ -345,6 +348,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        model_mproj_path=settings.model_mproj,
         seed=settings.seed,
         n_ctx=settings.n_ctx,
         n_batch=settings.n_batch,
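Since Settings is a pydantic BaseSettings, the new model_mproj field should be populated the same way as the existing fields, e.g. via a MODEL_MPROJ environment variable or the server's generated --model_mproj flag; the exact flag name is an assumption, as this diff does not show the server's argument parsing.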

vendor/llama.cpp
