Skip to content

Commit e83c965

Browse files
committed
basic integration of PowerInfer
1 parent f2901d8 commit e83c965

File tree

6 files changed

+35
-26
lines changed

6 files changed

+35
-26
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,5 @@ cython_debug/
176176

177177
# downloaded model .bin files
178178
docker/open_llama/*.bin
179+
180+
/.venv/**

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
[submodule "vendor/llama.cpp"]
22
path = vendor/llama.cpp
33
url = https://github.com/ggerganov/llama.cpp.git
4+
[submodule "vendor/PowerInfer"]
5+
path = vendor/PowerInfer
6+
url = https://github.com/SJTU-IPADS/PowerInfer.git

CMakeLists.txt

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ if (LLAMA_BUILD)
1616
set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
1717
set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
1818
endif()
19-
add_subdirectory(vendor/llama.cpp)
19+
add_subdirectory(vendor/PowerInfer)
2020
install(
2121
TARGETS llama
2222
LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
@@ -44,28 +44,28 @@ if (LLAMA_BUILD)
4444
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
4545
)
4646

47-
# Building llava
48-
add_subdirectory(vendor/llama.cpp/examples/llava)
49-
set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
50-
# Set CUDA_ARCHITECTURES to OFF on windows
51-
if (WIN32)
52-
set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
53-
endif()
54-
install(
55-
TARGETS llava_shared
56-
LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
57-
RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
58-
ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
59-
FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
60-
RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
61-
)
62-
# Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
63-
install(
64-
TARGETS llava_shared
65-
LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
66-
RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
67-
ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
68-
FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
69-
RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
70-
)
47+
# # Building llava
48+
# add_subdirectory(vendor/llama.cpp/examples/llava)
49+
# set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
50+
# # Set CUDA_ARCHITECTURES to OFF on windows
51+
# if (WIN32)
52+
# set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
53+
# endif()
54+
# install(
55+
# TARGETS llava_shared
56+
# LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
57+
# RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
58+
# ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
59+
# FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
60+
# RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
61+
# )
62+
# # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
63+
# install(
64+
# TARGETS llava_shared
65+
# LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
66+
# RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
67+
# ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
68+
# FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
69+
# RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
70+
# )
7171
endif()

examples/high_level_api/high_level_api_inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
output = llm(
1313
"Question: What are the names of the planets in the solar system? Answer: ",
14-
max_tokens=48,
14+
max_tokens=512,
1515
stop=["Q:", "\n"],
1616
echo=True,
1717
)

llama_cpp/llama_cpp.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,12 +305,15 @@ class llama_model_params(Structure):
305305
_fields_ = [
306306
("n_gpu_layers", c_int32),
307307
("main_gpu", c_int32),
308+
("vram_budget_gb", c_float),
308309
("tensor_split", c_float_p),
309310
("progress_callback", llama_progress_callback),
310311
("progress_callback_user_data", c_void_p),
311312
("vocab_only", c_bool),
312313
("use_mmap", c_bool),
313314
("use_mlock", c_bool),
315+
("reset_gpu_index", c_bool),
316+
("disable_gpu_index", c_bool),
314317
]
315318

316319

vendor/PowerInfer

Submodule PowerInfer added at 9d72668

0 commit comments

Comments (0)