
Commit 3434803

Merge branch 'main' into v0.2-wip
2 parents 77c9f49 + c7c700b commit 3434803

File tree: 5 files changed (+43, -2 lines)

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.77]
+
+- (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
+- (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
+
 ## [0.1.76]
 
 - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B

README.md

Lines changed: 8 additions & 0 deletions
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading llama-2 70b
+
+Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
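Taken together with the llama.py change below, a minimal sketch of loading a 70B model with both temporary parameters added in this merge; the model path is a placeholder, the `rms_norm_eps` value of 1e-5 is an assumption based on the norm epsilon published with Llama 2, and leaving either argument as `None` keeps the llama.cpp default:

```python
from llama_cpp import Llama

# Hypothetical 70B load: n_gqa=8 is required per the README change above;
# rms_norm_eps=1e-5 is an assumed value (Llama 2's published norm epsilon).
# Both parameters are marked TEMPORARY in this merge.
llm = Llama(
    model_path="./models/70B/ggml-model.bin",  # placeholder path
    n_gqa=8,
    rms_norm_eps=1e-5,
)
```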

llama_cpp/llama.py

Lines changed: 19 additions & 1 deletion
@@ -230,6 +230,8 @@ def __init__(
         tensor_split: Optional[List[float]] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
+        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
+        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -291,6 +293,12 @@ def __init__(
         self.params.rope_freq_base = rope_freq_base
         self.params.rope_freq_scale = rope_freq_scale
 
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
+
+        if rms_norm_eps is not None:
+            self.params.rms_norm_eps = rms_norm_eps
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1530,6 +1538,10 @@ def __getstate__(self):
             lora_base=self.lora_base,
             lora_path=self.lora_path,
             tensor_split=self.tensor_split,
+            ### TEMPORARY ###
+            n_gqa=self.params.n_gqa,
+            rms_norm_eps=self.params.rms_norm_eps,
+            ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1539,7 +1551,6 @@ def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
-            n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
@@ -1556,6 +1567,13 @@
             lora_path=state["lora_path"],
             tensor_split=state["tensor_split"],
             verbose=state["verbose"],
+            ### TEMPORARY ###
+            n_gqa=state["n_gqa"],
+            rms_norm_eps=state["rms_norm_eps"],
+            ### TEMPORARY ###
+            ### DEPRECATED ###
+            n_parts=state["n_parts"],
+            ### DEPRECATED ###
         )
 
     def save_state(self) -> LlamaState:
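The least obvious part of this change is the pickling support: `__getstate__` now records the two temporary fields, and `__setstate__` feeds them back through `__init__`. A minimal sketch of what that buys, assuming a local 70B GGML model at the placeholder path; it inspects the captured state dict directly rather than doing a full pickle round-trip, which would reload the model from disk:

```python
from llama_cpp import Llama

# Placeholder path; n_gqa=8 is the temporary Llama 2 70B setting from this merge.
llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)

# The constructor arguments captured for pickling now include the temporary
# fields, so a deserialized instance would be rebuilt with the same
# grouped-query-attention setting instead of the llama.cpp default.
state = llm.__getstate__()
assert state["n_gqa"] == 8
assert "rms_norm_eps" in state
```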

llama_cpp/server/app.py

Lines changed: 10 additions & 0 deletions
@@ -100,6 +100,14 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
+    n_gqa: Optional[int] = Field(
+        default=None,
+        description="TEMPORARY: Set to 8 for Llama2 70B",
+    )
+    rms_norm_eps: Optional[float] = Field(
+        default=None,
+        description="TEMPORARY",
+    )
 
 
 class ErrorResponse(TypedDict):
@@ -325,6 +333,8 @@ def create_app(settings: Optional[Settings] = None):
         last_n_tokens_size=settings.last_n_tokens_size,
         vocab_only=settings.vocab_only,
         verbose=settings.verbose,
+        n_gqa=settings.n_gqa,
+        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
        if settings.cache_type == "disk":
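A minimal sketch of how these settings could reach the web server when constructing the app in-process; the model path is a placeholder, the `rms_norm_eps` value is the assumed Llama 2 epsilon, and running via `uvicorn` programmatically is only one way to start the app:

```python
import uvicorn

from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/70B/ggml-model.bin",  # placeholder path
    n_gqa=8,            # TEMPORARY: required for Llama 2 70B per this merge
    rms_norm_eps=1e-5,  # TEMPORARY: assumed Llama 2 norm epsilon
)
app = create_app(settings=settings)

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)
```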

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "llama_cpp_python"
-version = "0.1.76"
+version = "0.1.77"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }
