Commit 328cd0d

Merge branch 'abetlen:main' into main

2 parents: 4e704d9 + 7c4aead

File tree

11 files changed: +232 additions, −245 deletions


.github/workflows/build-and-release.yaml

Lines changed: 5 additions & 3 deletions
@@ -92,7 +92,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
-
+
       - uses: actions/setup-python@v5
         with:
           python-version: "3.9"
@@ -103,6 +103,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install uv
           RUST_LOG=trace python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: bash

       - name: Install dependencies (Windows)
@@ -113,12 +114,13 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
         shell: cmd
-
+
       - name: Build source distribution
         run: |
           python -m build --sdist
-
+
       - uses: actions/upload-artifact@v4
         with:
           name: sdist

.github/workflows/publish.yaml

Lines changed: 7 additions & 5 deletions
@@ -13,34 +13,36 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
-
+
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.9"
-
+
      - name: Install dependencies (Linux/MacOS)
        if: runner.os != 'Windows'
        run: |
          python -m pip install --upgrade pip
          python -m pip install uv
          RUST_LOG=trace python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
        shell: bash

      - name: Install dependencies (Windows)
        if: runner.os == 'Windows'
        env:
-          RUST_LOG: trace
+          RUST_LOG: trace
        run: |
          python -m pip install --upgrade pip
          python -m pip install uv
          python -m uv pip install -e .[all] --verbose
+          python -m uv pip install build
        shell: cmd
-
+
      - name: Build source distribution
        run: |
          python -m build --sdist
-
+
      - name: Publish distribution to PyPI
        # TODO: move to tag based releases
        # if: startsWith(github.ref, 'refs/tags')

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.3.1]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@c919d5db39c8a7fcb64737f008e4b105ee0acd20
+- feat: Expose libggml in internal APIs by @abetlen in #1761
+- fix: Fix speculative decoding by @abetlen in 9992c5084a3df2f533e265d10f81d4269b97a1e6 and e975dabf74b3ad85689c9a07719cbb181313139b
+- misc: Rename all_text to remaining_text by @xu-song in #1658
+
+## [0.3.0]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@ea9c32be71b91b42ecc538bd902e93cbb5fb36cb
+- feat: Enable detokenizing special tokens with special=True by @benniekiss in #1596
+- feat(ci): Speed up CI workflows using uv, add support for CUDA 12.5 wheels by @Smartappli in e529940f45d42ed8aa31334123b8d66bc67b0e78
+- feat: Add loading sharded GGUF files from HuggingFace with Llama.from_pretrained(additional_files=[...]) by @Gnurro in 84c092063e8f222758dd3d60bdb2d1d342ac292e
+- feat: Add option to configure n_ubatch by @abetlen in 6c44a3f36b089239cb6396bb408116aad262c702
+- feat: Update sampling API for llama.cpp. Sampling now uses sampler chain by @abetlen in f8fcb3ea3424bcfba3a5437626a994771a02324b
+- fix: Don't store scores internally unless logits_all=True. Reduces memory requirements for large context by @abetlen in 29afcfdff5e75d7df4c13bad0122c98661d251ab
+- fix: Fix memory allocation of ndarray by @xu-song in #1704
+- fix: Use system message in og qwen format by @abetlen in 98eb092d3c6e7c142c4ba2faaca6c091718abbb3
+
 ## [0.2.90]

 - feat: Update llama.cpp to ggerganov/llama.cpp@1d1ccce67613674c75c9c7e3fa4c1e24e428ba48
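
As a usage note for the sharded-GGUF entry above, here is a minimal sketch of `Llama.from_pretrained(additional_files=[...])`. The repo and file names are hypothetical placeholders; only the `additional_files` parameter itself comes from the 0.3.0 entry.

```python
from llama_cpp import Llama

# Hypothetical repo/file names for illustration; additional_files downloads
# the remaining shards alongside the first one before the model is loaded.
llm = Llama.from_pretrained(
    repo_id="some-org/some-model-GGUF",
    filename="model-00001-of-00003.gguf",
    additional_files=[
        "model-00002-of-00003.gguf",
        "model-00003-of-00003.gguf",
    ],
)
```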

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.90"
+__version__ = "0.3.1"

llama_cpp/_ctypes_extensions.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import sys
+import os
+import ctypes
+import functools
+import pathlib
+
+from typing import (
+    Any,
+    Callable,
+    List,
+    Union,
+    Optional,
+    TYPE_CHECKING,
+    TypeVar,
+    Generic,
+)
+from typing_extensions import TypeAlias
+
+
+# Load the library
+def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
+    """Platform independent shared library loader"""
+    # Searching for the library in the current directory under the name "libllama" (default name
+    # for llamacpp) and "llama" (default name for this repo)
+    lib_paths: List[pathlib.Path] = []
+    # Determine the file extension based on the platform
+    if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"):
+        lib_paths += [
+            base_path / f"lib{lib_base_name}.so",
+        ]
+    elif sys.platform == "darwin":
+        lib_paths += [
+            base_path / f"lib{lib_base_name}.so",
+            base_path / f"lib{lib_base_name}.dylib",
+        ]
+    elif sys.platform == "win32":
+        lib_paths += [
+            base_path / f"{lib_base_name}.dll",
+            base_path / f"lib{lib_base_name}.dll",
+        ]
+    else:
+        raise RuntimeError("Unsupported platform")
+
+    cdll_args = dict()  # type: ignore
+
+    # Add the library directory to the DLL search path on Windows (if needed)
+    if sys.platform == "win32":
+        os.add_dll_directory(str(base_path))
+        os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"]
+
+    if sys.platform == "win32" and sys.version_info >= (3, 8):
+        os.add_dll_directory(str(base_path))
+        if "CUDA_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
+        if "HIP_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
+        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
+
+    # Try to load the shared library, handling potential errors
+    for lib_path in lib_paths:
+        if lib_path.exists():
+            try:
+                return ctypes.CDLL(str(lib_path), **cdll_args)  # type: ignore
+            except Exception as e:
+                raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}")
+
+    raise FileNotFoundError(
+        f"Shared library with base name '{lib_base_name}' not found"
+    )
+
+
+# ctypes sane type hint helpers
+#
+# - Generic Pointer and Array types
+# - PointerOrRef type with a type hinted byref function
+#
+# NOTE: Only use these for static type checking not for runtime checks
+# no good will come of that
+
+if TYPE_CHECKING:
+    CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData)  # type: ignore
+
+    CtypesArray: TypeAlias = ctypes.Array[CtypesCData]  # type: ignore
+
+    CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData]  # type: ignore
+
+    CtypesVoidPointer: TypeAlias = ctypes.c_void_p
+
+    class CtypesRef(Generic[CtypesCData]):
+        pass
+
+    CtypesPointerOrRef: TypeAlias = Union[
+        CtypesPointer[CtypesCData], CtypesRef[CtypesCData]
+    ]
+
+    CtypesFuncPointer: TypeAlias = ctypes._FuncPointer  # type: ignore
+
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def ctypes_function_for_shared_library(lib: ctypes.CDLL):
+    """Decorator for defining ctypes functions with type hints"""
+
+    def ctypes_function(
+        name: str, argtypes: List[Any], restype: Any, enabled: bool = True
+    ):
+        def decorator(f: F) -> F:
+            if enabled:
+                func = getattr(lib, name)
+                func.argtypes = argtypes
+                func.restype = restype
+                functools.wraps(f)(func)
+                return func
+            else:
+                return f
+
+        return decorator
+
+    return ctypes_function
+
+
+def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
+    """Type-annotated version of ctypes.byref"""
+    ...
+
+
+byref = _byref if TYPE_CHECKING else ctypes.byref
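
A sketch of how the decorator factory above is meant to be used. It assumes a `libllama` shared library built under the package's `lib/` directory and uses `llama_max_devices()`, a long-standing llama.cpp export taking no arguments and returning `size_t`; the stub body is replaced by the typed C function at import time.

```python
import ctypes
import pathlib

import llama_cpp._ctypes_extensions as ctypes_ext

# Assumes the shared library sits in <package>/lib, as _ggml.py below does.
_lib = ctypes_ext.load_shared_library(
    "llama", pathlib.Path(__file__).parent / "lib"
)
_ctypes_function = ctypes_ext.ctypes_function_for_shared_library(_lib)

# The decorated stub only carries the Python type hints; the decorator
# attaches argtypes/restype to the C symbol and returns it in place of f.
@_ctypes_function("llama_max_devices", [], ctypes.c_size_t)
def llama_max_devices() -> int:
    ...
```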

llama_cpp/_ggml.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+"""Internal module use at your own risk
+
+This module provides a minimal interface for working with ggml tensors from llama-cpp-python
+"""
+import os
+import pathlib
+
+import llama_cpp._ctypes_extensions as ctypes_ext
+
+libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path)
+
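
Since the module only exposes a raw `ctypes.CDLL` handle, callers declare signatures themselves. A hedged sketch, assuming `ggml_time_us()` (a long-standing ggml export returning microseconds as `int64_t`) is present in the built library:

```python
import ctypes

from llama_cpp._ggml import libggml

# Declare the signature before calling through the raw handle.
libggml.ggml_time_us.argtypes = []
libggml.ggml_time_us.restype = ctypes.c_int64

print("ggml clock (us):", libggml.ggml_time_us())
```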

llama_cpp/_logger.py

Lines changed: 7 additions & 0 deletions
@@ -10,17 +10,20 @@
 # GGML_LOG_LEVEL_WARN  = 2,
 # GGML_LOG_LEVEL_ERROR = 3,
 # GGML_LOG_LEVEL_DEBUG = 4,
+# GGML_LOG_LEVEL_CONT  = 5, // continue previous log
 # };
 GGML_LOG_LEVEL_TO_LOGGING_LEVEL = {
     0: logging.CRITICAL,
     1: logging.INFO,
     2: logging.WARNING,
     3: logging.ERROR,
     4: logging.DEBUG,
+    5: logging.DEBUG,
 }

 logger = logging.getLogger("llama-cpp-python")

+_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0]

 # typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 @llama_cpp.llama_log_callback
@@ -29,8 +32,12 @@ def llama_log_callback(
     text: bytes,
     user_data: ctypes.c_void_p,
 ):
+    # TODO: Correctly implement continue previous log
+    global _last_log_level
+    log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level
     if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
         print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
+    _last_log_level = log_level


 llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
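
Since the callback above filters on `logger.level`, downstream code can silence llama.cpp/ggml chatter by raising the threshold on the named logger shown in the diff:

```python
import logging

# Only ERROR (and CRITICAL) messages from llama.cpp/ggml reach stderr now.
logging.getLogger("llama-cpp-python").setLevel(logging.ERROR)
```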

llama_cpp/llama.py

Lines changed: 8 additions & 6 deletions
@@ -807,8 +807,10 @@ def sample(
             grammar=grammar,
         )

+        ridx = idx - self.n_tokens if idx is not None else -1
+
         assert self.ctx is not None
-        token = self._sampler.sample(self._ctx, -1)
+        token = self._sampler.sample(self._ctx, ridx)
         if tmp_sampler:
             self._sampler = None
         return token
@@ -928,7 +930,7 @@ def generate(

             sample_idx += 1
             if stopping_criteria is not None and stopping_criteria(
-                self._input_ids, self._scores[-1, :]
+                self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
             ):
                 return
             tokens_or_none = yield token
@@ -1517,15 +1519,15 @@ def logit_bias_processor(

         if stream:
             remaining_tokens = completion_tokens[returned_tokens:]
-            all_text = self.detokenize(
+            remaining_text = self.detokenize(
                 remaining_tokens,
                 prev_tokens=prompt_tokens + completion_tokens[:returned_tokens],
             )
-            any_stop = [s for s in stop_sequences if s in all_text]
+            any_stop = [s for s in stop_sequences if s in remaining_text]
             if len(any_stop) > 0:
-                end = min(all_text.index(stop) for stop in any_stop)
+                end = min(remaining_text.index(stop) for stop in any_stop)
             else:
-                end = len(all_text)
+                end = len(remaining_text)

             token_end_position = 0
             for token in remaining_tokens:
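
For reference, the `stopping_criteria` callable touched above receives the token ids generated so far and the logits row for the current step. A minimal sketch of a custom criterion; the newline token id of 13 is an assumption and varies by model vocabulary:

```python
import numpy as np

import llama_cpp

NEWLINE_TOKEN_ID = 13  # assumed id for illustration; model-specific

def stop_on_newline(input_ids: np.ndarray, logits: np.ndarray) -> bool:
    # Stop generation once the most recently sampled token is a newline.
    return len(input_ids) > 0 and input_ids[-1] == NEWLINE_TOKEN_ID

# Passed to completion calls via:
#   llm("prompt", stopping_criteria=llama_cpp.StoppingCriteriaList([stop_on_newline]))
```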
