File parsing and memory usage optimization (#74)
* Rework the entire file parsing system prepare for future changes * Estimate memory usage perfectly Removes whatever issue with small models that used to exist * Fix file stream ops on macOS for me this compiles on Windows 11, Ubuntu 20.04, and macOS 10.14 * Fix rwkv.cpp for non-WIN32 MSVC invocations like bindgen-rs * Implement Q8_1 quantization ...and disable the type, because GGML doesn't support the ops required to run inference with it. It's not worth any nasty hacks or workarounds right now, Q8_0 is very very similar if one wants 8-bit quantization. * Completely remove Q8_1 type This type isn't meant to be user-facing in any way so I may as well get rid of it now since it will probably never exist as a data format. * Switch from std::vector to unique array for model layers These don't ever need to be resized * Factor ffn.key.weight height into memory estimate some models have this set weirdly, in various different ways. just give up and record the actual size of it and use that * Make a few more operations inplace ggml doesn't currently expose most of the stuff it supports, so force some things. not 100% sure about this, I don't think the memory savings are that worth it * attempt a perfect upper bound size for the scratch space This should be the largest work_size seen in any model, since it is always larger than any of the other parameters except vocab (which does not participate in the graph work size). * Revert "Make a few more operations inplace" This reverts commit f94d6eb216040ae0ad23d2b9c87fae8349882f89. * Make fewer calls to fread micro-optimization * Fix memory size estimation for smaller models ggml works with some larger formats internally * print location in all assert macros * remove trailing whitespace * add type_to_string entry for unknown * Simplify quantization a bit * fix cuBLAS compatibility adding n_gpu_layers to rwkv_init_from_file won't work. 
add an extra function instead * fix quantize * quantize: don't create output file if opening input fails * Rename gpu offload layers might want to avoid branding it with cublas in case we add something like clblast support in the future * Remove old read_int32 and write_int32 functions It's all uints now * Remove static from things * Only call gpu_offload_layers if gpu_layer_count > 0 * Add rwkv_ prefix to all structures * Braces * Functions naming convention * Remove blank line after comment * Capitalize comments * Re-add quantize explanatory comment * Re-add histogram comment * Convert all error messages to uppercase * Make type conversions extern for ffi bindings from other langs * Name the state parts The code in rwkv_eval to initialize the state (when state_in is NULL) was getting very confusing so I just put everything in a struct to name it. * Fnvalid
This commit is contained in:
parent
241350fde6
commit
363dfb1a06
7
rwkv.h
7
rwkv.h
|
@ -83,8 +83,11 @@ extern "C" {
|
|||
// Returns NULL on any error. Error messages would be printed to stderr.
|
||||
// - model_file_path: path to model file in ggml format.
|
||||
// - n_threads: count of threads to use, must be positive.
|
||||
// - n_gpu_layer: count of layers need to load to gpu (only works when cuBLAS is on)
|
||||
RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
|
||||
RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
|
||||
|
||||
// Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
|
||||
// If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
|
||||
RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers);
|
||||
|
||||
// Evaluates the model for a single token.
|
||||
// Returns false on any error. Error messages would be printed to stderr.
|
||||
|
|
|
@ -32,11 +32,14 @@ class RWKVModel:
|
|||
|
||||
assert os.path.isfile(model_path), f'{model_path} is not a file'
|
||||
assert thread_count > 0, 'Thread count must be positive'
|
||||
assert gpu_layers_count > 0, 'GPU layers count must be positive'
|
||||
assert gpu_layers_count >= 0, 'GPU layers count must be >= 0'
|
||||
|
||||
self._library = shared_library
|
||||
|
||||
self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layers_count)
|
||||
self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
|
||||
|
||||
if gpu_layers_count > 0:
|
||||
self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)
|
||||
|
||||
self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx)
|
||||
self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx)
|
||||
|
|
|
@ -37,9 +37,12 @@ class RWKVSharedLibrary:
|
|||
|
||||
self.library = ctypes.cdll.LoadLibrary(shared_library_path)
|
||||
|
||||
self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32]
|
||||
self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
|
||||
self.library.rwkv_init_from_file.restype = ctypes.c_void_p
|
||||
|
||||
self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
|
||||
self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
|
||||
|
||||
self.library.rwkv_eval.argtypes = [
|
||||
ctypes.c_void_p, # ctx
|
||||
ctypes.c_int32, # token
|
||||
|
@ -67,7 +70,7 @@ class RWKVSharedLibrary:
|
|||
self.library.rwkv_get_system_info_string.argtypes = []
|
||||
self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p
|
||||
|
||||
def rwkv_init_from_file(self, model_file_path: str, thread_count: int, gpu_layers_count: int) -> RWKVContext:
|
||||
def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
|
||||
"""
|
||||
Loads the model from a file and prepares it for inference.
|
||||
Throws an exception in case of any error. Error messages would be printed to stderr.
|
||||
|
@ -83,11 +86,23 @@ class RWKVSharedLibrary:
|
|||
"""
|
||||
|
||||
ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'),
|
||||
ctypes.c_uint32(thread_count),
|
||||
ctypes.c_uint32(gpu_layers_count))
|
||||
ctypes.c_uint32(thread_count))
|
||||
assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
|
||||
return RWKVContext(ptr)
|
||||
|
||||
def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
|
||||
"""
|
||||
Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
|
||||
If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
gpu_layers_count : int
|
||||
Count of layers to load onto gpu, must be >= 0, only enabled with cuBLAS.
|
||||
"""
|
||||
|
||||
assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr'
|
||||
|
||||
def rwkv_eval(
|
||||
self,
|
||||
ctx: RWKVContext,
|
||||
|
|
|
@ -26,9 +26,12 @@
|
|||
void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
|
||||
fprintf(stderr, "Testing %s\n", model_path);
|
||||
|
||||
struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS, N_GPU_LAYERS);
|
||||
struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);
|
||||
enum rwkv_error_flags error = rwkv_get_last_error(NULL);
|
||||
ASSERT(error == 0, "Unexpected error %d", error);
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model));
|
||||
#endif
|
||||
|
||||
uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);
|
||||
|
||||
|
|
Loading…
Reference in New Issue