
Commit 19e3a54

Merge branch 'main' into docker
2 parents ed15d2e + 04959f1

11 files changed: +41 -23 lines changed

.github/workflows/build-and-release.yaml

Lines changed: 2 additions & 2 deletions

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.0
+        uses: pypa/cibuildwheel@v2.19.1
        env:
          # disable repair
          CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
          platforms: linux/arm64
 
      - name: Build wheels
-       uses: pypa/cibuildwheel@v2.19.0
+       uses: pypa/cibuildwheel@v2.19.1
        env:
          CIBW_SKIP: "*musllinux* pp*"
          CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/build-docker.yaml

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ jobs:
 
      - name: Build and push
        id: docker_build
-       uses: docker/build-push-action@v5
+       uses: docker/build-push-action@v6
        with:
          context: .
          file: "docker/simple/Dockerfile"

.github/workflows/build-wheels-metal.yaml

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ jobs:
          python -m pip install -e .[all]
 
      - name: Build wheels
-       uses: pypa/cibuildwheel@v2.18.1
+       uses: pypa/cibuildwheel@v2.19.1
       env:
         # disable repair
         CIBW_REPAIR_WHEEL_COMMAND: ""

CHANGELOG.md

Lines changed: 7 additions & 0 deletions

@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.79]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@9c77ec1d74874ee22bdef8f110e8e8d41389abf2
+- feat(ci): Update workflows and pre-built wheels by @Smartappli in #1416
+- feat: Add .close() method to Llama class to explicitly free model from memory by @jkawamoto in #1513
+- feat: Support SPM infill by @CISC in #1492
+
 ## [0.2.78]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@fd5ea0f897ecb3659d6c269ef6f3d833e865ead7

Makefile

Lines changed: 0 additions & 3 deletions

@@ -24,9 +24,6 @@ build.debug:
 build.cuda:
 	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
 
-build.opencl:
-	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
-
 build.openblas:
 	CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
README.md

Lines changed: 2 additions & 13 deletions

@@ -165,17 +165,6 @@ pip install llama-cpp-python \
   --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
 ```
 
-</details>
-<details>
-
-<summary>CLBlast (OpenCL)</summary>
-
-To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
-
-```bash
-CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
-```
-
 </details>
 
 <details>
@@ -338,7 +327,7 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i
 
 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
+    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
     filename="*q8_0.gguf",
     verbose=False
 )
@@ -699,7 +688,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
 
 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
 ```
 
 ### Web Server Features
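For context, here is a minimal end-to-end sketch based on the updated README snippet. The chat prompt, the `create_chat_completion` call, and the printed field access are illustrative additions, not part of the diff.

```python
from llama_cpp import Llama

# Sketch only: mirrors the README's updated from_pretrained example; the prompt is illustrative.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False,
)

# Run a single chat completion against the downloaded model.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one word."}]
)
print(out["choices"][0]["message"]["content"])
```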

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.78"
+__version__ = "0.2.79"

llama_cpp/_internals.py

Lines changed: 9 additions & 0 deletions

@@ -64,6 +64,9 @@ def free_model():
     def close(self):
         self._exit_stack.close()
 
+    def __del__(self):
+        self.close()
+
     def vocab_type(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_vocab_type(self.model)
@@ -292,6 +295,9 @@ def free_ctx():
     def close(self):
         self._exit_stack.close()
 
+    def __del__(self):
+        self.close()
+
     def n_ctx(self) -> int:
         assert self.ctx is not None
         return llama_cpp.llama_n_ctx(self.ctx)
@@ -531,6 +537,9 @@ def free_batch():
     def close(self):
         self._exit_stack.close()
 
+    def __del__(self):
+        self.close()
+
     def n_tokens(self) -> int:
         assert self.batch is not None
         return self.batch.n_tokens

llama_cpp/llama.py

Lines changed: 3 additions & 0 deletions

@@ -1968,6 +1968,9 @@ def close(self) -> None:
         """Explicitly free the model from memory."""
         self._stack.close()
 
+    def __del__(self) -> None:
+        self.close()
+
     @staticmethod
     def logits_to_logprobs(
         logits: Union[npt.NDArray[np.single], List], axis: int = -1
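A hedged usage sketch of the explicit-release behaviour added here: `close()` can be called directly, and `__del__` now falls back to it at garbage collection. The model path and prompt below are placeholders, not part of the diff.

```python
from llama_cpp import Llama

# Sketch only: "./model.gguf" is a placeholder path.
llm = Llama(model_path="./model.gguf")
try:
    llm("Q: What is 2 + 2? A:", max_tokens=8)
finally:
    llm.close()  # frees the model explicitly; __del__ also calls close() when the object is collected
```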

llama_cpp/llama_cpp.py

Lines changed: 14 additions & 1 deletion

@@ -301,6 +301,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
 # LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
 # LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+# LLAMA_VOCAB_PRE_TYPE_PORO = 15,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -317,6 +318,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_VOCAB_PRE_TYPE_OLMO = 12
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
+LLAMA_VOCAB_PRE_TYPE_PORO = 15
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -466,11 +468,13 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_POOLING_TYPE_NONE = 0,
 # LLAMA_POOLING_TYPE_MEAN = 1,
 # LLAMA_POOLING_TYPE_CLS = 2,
+# LLAMA_POOLING_TYPE_LAST = 3,
 # };
 LLAMA_POOLING_TYPE_UNSPECIFIED = -1
 LLAMA_POOLING_TYPE_NONE = 0
 LLAMA_POOLING_TYPE_MEAN = 1
 LLAMA_POOLING_TYPE_CLS = 2
+LLAMA_POOLING_TYPE_LAST = 3
 
 # enum llama_split_mode {
 # LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -759,7 +763,6 @@ class llama_model_params(ctypes.Structure):
 
 # enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 # enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
-# // (ignored if no pooling layer)
 
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -2314,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
     ...
 
 
+# // Set whether the model is in embeddings model or not
+# // If true, embeddings will be returned but logits will not
+# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
+    """Set whether the model is in embeddings model or not
+    If true, embeddings will be returned but logits will not"""
+    ...
+
+
 # // Set whether to use causal attention or not
 # // If set to true, the model will only attend to the past tokens
 # LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
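A minimal low-level sketch of calling the newly bound `llama_set_embeddings`. It assumes a local GGUF file ("./model.gguf" is a placeholder) and uses only functions already exposed by the `llama_cpp` bindings; it is not part of the diff itself.

```python
import llama_cpp

# Sketch only: load a model and context through the low-level API, then toggle
# embeddings mode with the newly bound llama_set_embeddings.
llama_cpp.llama_backend_init()
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./model.gguf", model_params)
ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# If True, subsequent decodes return embeddings and logits are not produced.
llama_cpp.llama_set_embeddings(ctx, True)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
llama_cpp.llama_backend_free()
```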
