18 | 18 | "name": "stderr",
19 | 19 | "output_type": "stream",
20 | 20 | "text": [
   | 21 | + "ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n",
   | 22 | + "ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n",
21 | 23 | "ggml_init_cublas: found 1 CUDA devices:\n",
22 | 24 | " Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5\n"
23 | 25 | ]

36 | 38 | "name": "stderr",
37 | 39 | "output_type": "stream",
38 | 40 | "text": [
39 |    | - "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../../models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf (version GGUF V2 (latest))\n",
   | 41 | + "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../../models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf (version GGUF V2)\n",
40 | 42 | "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32000, 1, 1 ]\n",
41 | 43 | "llama_model_loader: - tensor 1: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n",
42 | 44 | "llama_model_loader: - tensor 2: output.weight q6_K [ 4096, 32000, 1, 1 ]\n",

347 | 349 | "llama_model_loader: - type f32: 65 tensors\n",
348 | 350 | "llama_model_loader: - type q4_K: 193 tensors\n",
349 | 351 | "llama_model_loader: - type q6_K: 33 tensors\n",
350 |     | - "llm_load_print_meta: format = GGUF V2 (latest)\n",
    | 352 | + "llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n",
    | 353 | + "llm_load_print_meta: format = GGUF V2\n",
351 | 354 | "llm_load_print_meta: arch = llama\n",
352 | 355 | "llm_load_print_meta: vocab type = SPM\n",
353 | 356 | "llm_load_print_meta: n_vocab = 32000\n",

361 | 364 | "llm_load_print_meta: n_gqa = 4\n",
362 | 365 | "llm_load_print_meta: f_norm_eps = 0.0e+00\n",
363 | 366 | "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n",
    | 367 | + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n",
    | 368 | + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
364 | 369 | "llm_load_print_meta: n_ff = 14336\n",
365 | 370 | "llm_load_print_meta: freq_base_train = 10000.0\n",
366 | 371 | "llm_load_print_meta: freq_scale_train = 1\n",

373 | 378 | "llm_load_print_meta: EOS token = 2 '</s>'\n",
374 | 379 | "llm_load_print_meta: UNK token = 0 '<unk>'\n",
375 | 380 | "llm_load_print_meta: LF token = 13 '<0x0A>'\n",
376 |     | - "llm_load_tensors: ggml ctx size = 0.09 MB\n",
    | 381 | + "llm_load_tensors: ggml ctx size = 0.10 MB\n",
377 | 382 | "llm_load_tensors: using CUDA for GPU acceleration\n",
378 | 383 | "llm_load_tensors: mem required = 70.41 MB\n",
379 | 384 | "llm_load_tensors: offloading 32 repeating layers to GPU\n",

399 | 404 | "name": "stdout",
400 | 405 | "output_type": "stream",
401 | 406 | "text": [
402 |     | - "[1, 415, 2936, 9060, 285, 1142]\n",
    | 407 | + "[1, 1014, 2936, 9060, 285, 1142]\n",
403 | 408 | "58\n"
404 | 409 | ]
405 | 410 | }
411 | 416 | "prompt = b\"The quick brown fox\"\n",
|
412 | 417 | "\n",
|
413 | 418 | "tokens = (llama_cpp.llama_token * n_ctx)()\n",
|
414 |
| - "tokens_len = llama_cpp.llama_tokenize(model, prompt, len(prompt), tokens, len(tokens), True)\n", |
| 419 | + "tokens_len = llama_cpp.llama_tokenize(model, prompt, len(prompt), tokens, len(tokens), True, True)\n", |
415 | 420 | "print(tokens[:tokens_len])\n",
|
416 | 421 | "\n",
|
417 | 422 | "n_kv_req = tokens_len + (n_len - tokens_len) * n_parallel\n",
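
The tokenize call gains a second trailing boolean here. Judging by llama.cpp's C header from the same period, the two flags are add_bos and special (whether special tokens in the text are parsed as tokens rather than treated as plain bytes); take those names as an assumption rather than something this diff confirms. A minimal wrapper, sketched below, keeps the buffer handling in one place:

    import llama_cpp

    def tokenize(model, text: bytes, n_max: int):
        # Hypothetical helper; the add_bos/special names are assumed from
        # llama.cpp's header, not stated in this diff.
        buf = (llama_cpp.llama_token * n_max)()
        n = llama_cpp.llama_tokenize(
            model, text, len(text), buf, n_max, True, True  # add_bos, special
        )
        if n < 0:
            # llama.cpp conventionally returns the negated required size on overflow.
            raise ValueError(f"token buffer too small, need {-n} slots")
        return list(buf[:n])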
434 | 439 | "llama_kv_cache_init: offloading k cache to GPU\n",
435 | 440 | "llama_kv_cache_init: VRAM kv self = 7.25 MB\n",
436 | 441 | "llama_new_context_with_model: kv self size = 7.25 MB\n",
437 |     | - "llama_new_context_with_model: compute buffer total size = 10.38 MB\n",
    | 442 | + "llama_build_graph: non-view tensors processed: 740/740\n",
    | 443 | + "llama_new_context_with_model: compute buffer total size = 10.63 MB\n",
438 | 444 | "llama_new_context_with_model: VRAM scratch buffer: 4.51 MB\n",
439 | 445 | "llama_new_context_with_model: total VRAM used: 4106.81 MB (model: 4095.05 MB, context: 11.76 MB)\n"
440 | 446 | ]
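
The kv cache numbers in this log can be sanity-checked against the load log above. The 58 printed earlier is consistent with n_kv_req = tokens_len + (n_len - tokens_len) * n_parallel for the 6-token prompt with n_len = 32 and n_parallel = 2 (6 + 26 * 2 = 58); those two values are assumptions, since the cells defining them are not part of this diff. An f16 K and V cache over 58 slots then lands exactly on the reported size:

    # Back-of-envelope check of "kv self size = 7.25 MB", assuming an f16
    # cache sized to n_kv_req slots; that sizing is an assumption.
    n_layer   = 32          # from the load log
    n_kv      = 58          # n_kv_req printed above
    n_embd_kv = 4096 // 4   # n_embd / n_gqa = 1024 (grouped-query attention)
    f16_bytes = 2

    kv_bytes = 2 * n_layer * n_kv * n_embd_kv * f16_bytes  # K and V
    print(kv_bytes / 1024**2)  # 7.25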
458 | 464 | "outputs": [],
459 | 465 | "source": [
460 | 466 | "n_ctx = llama_cpp.llama_n_ctx(ctx)\n",
461 |     | - "batch = llama_cpp.llama_batch_init(max(tokens_len, n_parallel), 0)\n",
    | 467 | + "batch = llama_cpp.llama_batch_init(max(tokens_len, n_parallel), 0, 1)"
    | 468 | + ]
    | 469 | + },
    | 470 | + {
    | 471 | + "cell_type": "code",
    | 472 | + "execution_count": 7,
    | 473 | + "metadata": {},
    | 474 | + "outputs": [],
    | 475 | + "source": [
    | 476 | + "import ctypes\n",
462 | 477 | "\n",
463 | 478 | "batch.n_tokens = tokens_len\n",
464 | 479 | "for i in range(tokens_len):\n",
465 | 480 | " batch.token[i] = tokens[i]\n",
466 | 481 | " batch.pos[i] = i\n",
467 |     | - " batch.seq_id[i] = 0\n",
    | 482 | + " batch.seq_id[i] = (ctypes.c_int32 * 1)(0)\n",
    | 483 | + " batch.n_seq_id[i] = 1\n",
468 | 484 | " batch.logits[i] = False\n",
469 | 485 | "\n",
470 | 486 | "batch.logits[batch.n_tokens - 1] = True\n",
475 | 491 | },
476 | 492 | {
477 | 493 | "cell_type": "code",
478 |     | - "execution_count": 7,
    | 494 | + "execution_count": 8,
479 | 495 | "metadata": {},
480 | 496 | "outputs": [],
481 | 497 | "source": [

485 | 501 | },
486 | 502 | {
487 | 503 | "cell_type": "code",
488 |     | - "execution_count": 8,
    | 504 | + "execution_count": 9,
489 | 505 | "metadata": {},
490 | 506 | "outputs": [
491 | 507 | {
594 | 610 | "\n",
595 | 611 | " batch.token[batch.n_tokens] = new_token_id\n",
596 | 612 | " batch.pos[batch.n_tokens] = n_cur\n",
597 |     | - " batch.seq_id[batch.n_tokens] = i\n",
    | 613 | + " batch.seq_id[batch.n_tokens] = (ctypes.c_int32 * 1)(i)\n",
    | 614 | + " batch.n_seq_id[batch.n_tokens] = 1\n",
598 | 615 | " batch.logits[batch.n_tokens] = True\n",
599 | 616 | "\n",
600 | 617 | " i_batch[i] = batch.n_tokens\n",
615 | 632 | },
616 | 633 | {
617 | 634 | "cell_type": "code",
618 |     | - "execution_count": 9,
    | 635 | + "execution_count": 10,
619 | 636 | "metadata": {},
620 | 637 | "outputs": [
621 | 638 | {

632 | 649 | },
633 | 650 | {
634 | 651 | "cell_type": "code",
635 |     | - "execution_count": 10,
    | 652 | + "execution_count": 11,
636 | 653 | "metadata": {},
637 | 654 | "outputs": [],
638 | 655 | "source": [

641 | 658 | },
642 | 659 | {
643 | 660 | "cell_type": "code",
644 |     | - "execution_count": 11,
    | 661 | + "execution_count": 12,
645 | 662 | "metadata": {},
646 | 663 | "outputs": [],
647 | 664 | "source": [

650 | 667 | },
651 | 668 | {
652 | 669 | "cell_type": "code",
653 |     | - "execution_count": 12,
    | 670 | + "execution_count": 13,
654 | 671 | "metadata": {},
655 | 672 | "outputs": [],
656 | 673 | "source": [

659 | 676 | },
660 | 677 | {
661 | 678 | "cell_type": "code",
662 |     | - "execution_count": 13,
    | 679 | + "execution_count": 14,
663 | 680 | "metadata": {},
664 | 681 | "outputs": [],
665 | 682 | "source": [
666 | 683 | "llama_cpp.llama_backend_free()"
667 | 684 | ]
668 | 685 | },
    | 686 | + {
    | 687 | + "cell_type": "code",
    | 688 | + "execution_count": null,
    | 689 | + "metadata": {},
    | 690 | + "outputs": [],
    | 691 | + "source": []
    | 692 | + },
669 | 693 | {
670 | 694 | "cell_type": "code",
671 | 695 | "execution_count": null,