
Commit 9c41a3e

Merge branch 'main' of github.com:abetlen/llama_cpp_python into main

2 parents: f27393a + f568bae

4 files changed (+119, -3 lines)

README.md

Lines changed: 4 additions & 1 deletion
````diff
@@ -17,6 +17,9 @@ This package provides:
 
 Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python).
 
+Detailed MacOS Metal GPU install documentation is available at [docs/macos_install.md](docs/macos_install.md)
+
+
 ## Installation from PyPI (recommended)
 
 Install from PyPI (requires a c compiler):
@@ -25,7 +28,7 @@ Install from PyPI (requires a c compiler):
 pip install llama-cpp-python
 ```
 
-The above command will attempt to install the package and build build `llama.cpp` from source.
+The above command will attempt to install the package and build `llama.cpp` from source.
 This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.
 
 If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly:
````

docs/macos_install.md

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@ (new file)

# llama-cpp-python - MacOS Install with Metal GPU

**(1) Make sure you have Xcode installed... at least the command line parts**
```
# check the path of your Xcode install
xcode-select -p

# if Xcode is installed, this returns its path, e.g.
# /Applications/Xcode-beta.app/Contents/Developer

# if Xcode is missing, install it... it takes ages
xcode-select --install
```
**(2) Install the conda version for MacOS that supports Metal GPU**
```
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
bash Miniforge3-MacOSX-arm64.sh
```

**(3) Make a conda environment**
```
conda create -n llama python=3.9.16
conda activate llama
```
**(4) Install the LATEST llama-cpp-python... which, as of just today, happily supports MacOS Metal GPU**
*(you need Xcode installed in order for pip to build/compile the C++ code)*
```
pip uninstall llama-cpp-python -y
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
pip install 'llama-cpp-python[server]'

# you should now have llama-cpp-python v0.1.62 installed
llama-cpp-python         0.1.62
```
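A quick way to confirm the build at this point is a bare import; this is a minimal sketch under the assumption that the Metal build above succeeded, not part of the committed walkthrough:

```python
# Minimal import check: if the CMAKE_ARGS="-DLLAMA_METAL=on" build
# succeeded, this loads the compiled llama.cpp shared library cleanly.
from llama_cpp import Llama  # noqa: F401

print("llama-cpp-python imported successfully")
```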
**(5) Download a v3 ggml llama/vicuna/alpaca model**
- **ggmlv3**
- file name ends with **q4_0.bin**, indicating it is 4-bit quantized, with quantization method 0

https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-q4_0.bin
https://huggingface.co/vicuna/ggml-vicuna-13b-1.1/blob/main/ggml-vic13b-uncensored-q4_0.bin
https://huggingface.co/TheBloke/LLaMa-7B-GGML/blob/main/llama-7b.ggmlv3.q4_0.bin
https://huggingface.co/TheBloke/LLaMa-13B-GGML/blob/main/llama-13b.ggmlv3.q4_0.bin
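As one alternative to downloading the linked files in a browser, a sketch using `huggingface_hub` (the library choice is an assumption; the repo and file names are taken from the links above):

```python
# Sketch: fetch one of the linked q4_0 models programmatically.
# Assumes: pip install huggingface_hub
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="TheBloke/LLaMa-7B-GGML",
    filename="llama-7b.ggmlv3.q4_0.bin",
)
print(model_path)  # local cached path; pass this as --model to the server
```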
**(6) Run the llama-cpp-python API server with MacOS Metal GPU support**
```
# configure your ggml model path
# make sure it is ggml v3
# make sure it is q4_0
export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]-q4_0.bin
python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
```

***Note:** if you omit `--n_gpu_layers 1`, the CPU will be used*
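Once running, the server exposes an OpenAI-compatible REST API. A minimal client sketch, assuming the server's default bind address (http://localhost:8000; adjust if you pass --host/--port):

```python
# Query the local server started above; the host/port is an assumption
# based on the server's defaults.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "Q: Name the planets in the solar system. A:", "max_tokens": 64},
)
print(resp.json()["choices"][0]["text"])
```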

llama_cpp/llama.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1378,6 +1378,7 @@ def create_chat_completion(
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
         model: Optional[str] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.
@@ -1419,6 +1420,7 @@ def create_chat_completion(
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
             model=model,
+            logits_processor=logits_processor,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
```
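This threads an optional `logits_processor` from `create_chat_completion` down to the underlying completion call. A minimal caller-side sketch (the model path is a placeholder and the pass-through processor is illustrative):

```python
# Sketch: supply a custom logits processor to chat completion.
# A processor maps (input_ids, scores) -> scores, per the diff above.
from llama_cpp import Llama, LogitsProcessorList

llama = Llama(model_path="./ggml-vic13b-q4_0.bin")  # placeholder path

def identity_processor(input_ids, scores):
    # A real processor would adjust the logits; this one passes them through.
    return scores

result = llama.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello."}],
    logits_processor=LogitsProcessorList([identity_processor]),
)
print(result["choices"][0]["message"]["content"])
```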

llama_cpp/server/app.py

Lines changed: 51 additions & 2 deletions
```diff
@@ -259,13 +259,14 @@ class CreateCompletionRequest(BaseModel):
     )
     presence_penalty: Optional[float] = presence_penalty_field
     frequency_penalty: Optional[float] = frequency_penalty_field
+    logit_bias: Optional[Dict[str, float]] = Field(None)
+    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
     # ignored or currently unsupported
     model: Optional[str] = model_field
     n: Optional[int] = 1
     logprobs: Optional[int] = Field(None)
     best_of: Optional[int] = 1
-    logit_bias: Optional[Dict[str, float]] = Field(None)
     user: Optional[str] = Field(None)
 
     # llama.cpp specific parameters
```
```diff
@@ -284,6 +285,39 @@ class Config:
 CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
 
 
+def make_logit_bias_processor(
+    llama: llama_cpp.Llama,
+    logit_bias: Dict[str, float],
+    logit_bias_type: Optional[Literal["input_ids", "tokens"]],
+):
+    if logit_bias_type is None:
+        logit_bias_type = "input_ids"
+
+    to_bias: Dict[int, float] = {}
+    if logit_bias_type == "input_ids":
+        for input_id, score in logit_bias.items():
+            input_id = int(input_id)
+            to_bias[input_id] = score
+
+    elif logit_bias_type == "tokens":
+        for token, score in logit_bias.items():
+            token = token.encode('utf-8')
+            for input_id in llama.tokenize(token, add_bos=False):
+                to_bias[input_id] = score
+
+    def logit_bias_processor(
+        input_ids: List[int],
+        scores: List[float],
+    ) -> List[float]:
+        new_scores = [None] * len(scores)
+        for input_id, score in enumerate(scores):
+            new_scores[input_id] = score + to_bias.get(input_id, 0.0)
+
+        return new_scores
+
+    return logit_bias_processor
+
+
 @router.post(
     "/v1/completions",
     response_model=CreateCompletionResponse,
```
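For intuition about the factory above: in `input_ids` mode the `llama` argument is never consulted, so a small self-contained sketch can exercise the returned closure directly (the values are made up):

```python
# Illustrative run of make_logit_bias_processor in "input_ids" mode.
# Keys are token ids as strings (per the request schema); each value is
# added to the corresponding logit. None stands in for the unused llama.
processor = make_logit_bias_processor(None, {"2": 10.0}, "input_ids")

scores = [0.0, 1.0, 2.0, 3.0]  # pretend logits for a 4-token vocabulary
print(processor([], scores))   # -> [0.0, 1.0, 12.0, 3.0]
```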
```diff
@@ -301,9 +335,16 @@ async def create_completion(
         "n",
         "best_of",
         "logit_bias",
+        "logit_bias_type",
         "user",
     }
     kwargs = body.dict(exclude=exclude)
+
+    if body.logit_bias is not None:
+        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
+            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
+        ])
+
     if body.stream:
         send_chan, recv_chan = anyio.create_memory_object_stream(10)
```

```diff
@@ -382,11 +423,12 @@ class CreateChatCompletionRequest(BaseModel):
     stream: bool = stream_field
     presence_penalty: Optional[float] = presence_penalty_field
     frequency_penalty: Optional[float] = frequency_penalty_field
+    logit_bias: Optional[Dict[str, float]] = Field(None)
+    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
 
     # ignored or currently unsupported
     model: Optional[str] = model_field
     n: Optional[int] = 1
-    logit_bias: Optional[Dict[str, float]] = Field(None)
     user: Optional[str] = Field(None)
 
     # llama.cpp specific parameters
```
```diff
@@ -423,9 +465,16 @@ async def create_chat_completion(
     exclude = {
         "n",
         "logit_bias",
+        "logit_bias_type",
         "user",
     }
     kwargs = body.dict(exclude=exclude)
+
+    if body.logit_bias is not None:
+        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
+            make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
+        ])
+
     if body.stream:
         send_chan, recv_chan = anyio.create_memory_object_stream(10)
```
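With both handlers updated, OpenAI-style clients can send the new fields. A hedged sketch against a locally running server (the host, port, and bias target are assumptions):

```python
# Sketch: exercise logit_bias / logit_bias_type over HTTP. In "tokens"
# mode the server tokenizes each key and biases the resulting ids;
# "input_ids" mode takes numeric token ids as string keys instead.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "The capital of France is",
        "max_tokens": 8,
        "logit_bias": {" Paris": 5.0},
        "logit_bias_type": "tokens",
    },
)
print(resp.json()["choices"][0]["text"])
```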
