
Commit 753dfbc

Update notebook
1 parent 7ff8508 commit 753dfbc

1 file changed

examples/notebooks/Batching.ipynb

Lines changed: 40 additions & 16 deletions
@@ -18,6 +18,8 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
+     "ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n",
+     "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_init_cublas: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n"
     ]
@@ -36,7 +38,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../../models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf (version GGUF V2 (latest))\n",
+     "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../../models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf (version GGUF V2)\n",
      "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32000, 1, 1 ]\n",
      "llama_model_loader: - tensor 1: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
      "llama_model_loader: - tensor 2: output.weight q6_K [ 4096, 32000, 1, 1 ]\n",
@@ -347,7 +349,8 @@
     "llama_model_loader: - type f32: 65 tensors\n",
     "llama_model_loader: - type q4_K: 193 tensors\n",
     "llama_model_loader: - type q6_K: 33 tensors\n",
-     "llm_load_print_meta: format = GGUF V2 (latest)\n",
+     "llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n",
+     "llm_load_print_meta: format = GGUF V2\n",
      "llm_load_print_meta: arch = llama\n",
      "llm_load_print_meta: vocab type = SPM\n",
      "llm_load_print_meta: n_vocab = 32000\n",
@@ -361,6 +364,8 @@
     "llm_load_print_meta: n_gqa = 4\n",
     "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
     "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
+     "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
+     "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
      "llm_load_print_meta: n_ff = 14336\n",
      "llm_load_print_meta: freq_base_train = 10000.0\n",
      "llm_load_print_meta: freq_scale_train = 1\n",
@@ -373,7 +378,7 @@
     "llm_load_print_meta: EOS token = 2 '</s>'\n",
     "llm_load_print_meta: UNK token = 0 '<unk>'\n",
     "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
-     "llm_load_tensors: ggml ctx size = 0.09 MB\n",
+     "llm_load_tensors: ggml ctx size = 0.10 MB\n",
      "llm_load_tensors: using CUDA for GPU acceleration\n",
      "llm_load_tensors: mem required = 70.41 MB\n",
      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
@@ -399,7 +404,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "[1, 415, 2936, 9060, 285, 1142]\n",
+     "[1, 1014, 2936, 9060, 285, 1142]\n",
      "58\n"
     ]
    }
@@ -411,7 +416,7 @@
     "prompt = b\"The quick brown fox\"\n",
     "\n",
     "tokens = (llama_cpp.llama_token * n_ctx)()\n",
-     "tokens_len = llama_cpp.llama_tokenize(model, prompt, len(prompt), tokens, len(tokens), True)\n",
+     "tokens_len = llama_cpp.llama_tokenize(model, prompt, len(prompt), tokens, len(tokens), True, True)\n",
      "print(tokens[:tokens_len])\n",
      "\n",
      "n_kv_req = tokens_len + (n_len - tokens_len) * n_parallel\n",
@@ -434,7 +439,8 @@
     "llama_kv_cache_init: offloading k cache to GPU\n",
     "llama_kv_cache_init: VRAM kv self = 7.25 MB\n",
     "llama_new_context_with_model: kv self size = 7.25 MB\n",
-     "llama_new_context_with_model: compute buffer total size = 10.38 MB\n",
+     "llama_build_graph: non-view tensors processed: 740/740\n",
+     "llama_new_context_with_model: compute buffer total size = 10.63 MB\n",
      "llama_new_context_with_model: VRAM scratch buffer: 4.51 MB\n",
      "llama_new_context_with_model: total VRAM used: 4106.81 MB (model: 4095.05 MB, context: 11.76 MB)\n"
     ]
@@ -458,13 +464,23 @@
    "outputs": [],
    "source": [
     "n_ctx = llama_cpp.llama_n_ctx(ctx)\n",
-     "batch = llama_cpp.llama_batch_init(max(tokens_len, n_parallel), 0)\n",
+     "batch = llama_cpp.llama_batch_init(max(tokens_len, n_parallel), 0, 1)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import ctypes\n",
      "\n",
      "batch.n_tokens = tokens_len\n",
      "for i in range(tokens_len):\n",
      "    batch.token[i] = tokens[i]\n",
      "    batch.pos[i] = i\n",
-     "    batch.seq_id[i] = 0\n",
+     "    batch.seq_id[i] = (ctypes.c_int32 * 1)(0)\n",
+     "    batch.n_seq_id[i] = 1\n",
      "    batch.logits[i] = False\n",
      "\n",
      "batch.logits[batch.n_tokens - 1] = True\n",
@@ -475,7 +491,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 7,
+    "execution_count": 8,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -485,7 +501,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 8,
+    "execution_count": 9,
     "metadata": {},
     "outputs": [
      {
@@ -594,7 +610,8 @@
     "\n",
     "        batch.token[batch.n_tokens] = new_token_id\n",
     "        batch.pos[batch.n_tokens] = n_cur\n",
-     "        batch.seq_id[batch.n_tokens] = i\n",
+     "        batch.seq_id[batch.n_tokens] = (ctypes.c_int32 * 1)(i)\n",
+     "        batch.n_seq_id[batch.n_tokens] = 1\n",
      "        batch.logits[batch.n_tokens] = True\n",
      "\n",
      "        i_batch[i] = batch.n_tokens\n",
@@ -615,7 +632,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 9,
+    "execution_count": 10,
     "metadata": {},
     "outputs": [
      {
@@ -632,7 +649,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 10,
+    "execution_count": 11,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -641,7 +658,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 11,
+    "execution_count": 12,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -650,7 +667,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 12,
+    "execution_count": 13,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -659,13 +676,20 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 13,
+    "execution_count": 14,
     "metadata": {},
     "outputs": [],
     "source": [
      "llama_cpp.llama_backend_free()"
     ]
    },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
    {
     "cell_type": "code",
     "execution_count": null,
