File parsing and memory usage optimization (#74)

* Rework the entire file parsing system

prepare for future changes

* Estimate memory usage perfectly

Removes whatever issue with small models that used to exist

* Fix file stream ops on macOS

for me this compiles on Windows 11, Ubuntu 20.04, and macOS 10.14

* Fix rwkv.cpp for non-WIN32 MSVC invocations like bindgen-rs

* Implement Q8_1 quantization

...and disable the type, because GGML doesn't support the ops
required to run inference with it.

It's not worth any nasty hacks or workarounds right now, Q8_0 is
very very similar if one wants 8-bit quantization.

* Completely remove Q8_1 type

This type isn't meant to be user-facing in any way so I may as well
get rid of it now since it will probably never exist as a data
format.

* Switch from std::vector to unique array for model layers

These don't ever need to be resized

* Factor ffn.key.weight height into memory estimate

some models have this set weirdly, in various different ways.
just give up and record the actual size of it and use that

* Make a few more operations inplace

ggml doesn't currently expose most of the stuff it supports, so
force some things. not 100% sure about this, I don't think the
memory savings are that worth it

* attempt a perfect upper bound size for the scratch space

This should be the largest work_size seen in any model, since it
is always larger than any of the other parameters except vocab
(which does not participate in the graph work size).

* Revert "Make a few more operations inplace"

This reverts commit f94d6eb216040ae0ad23d2b9c87fae8349882f89.

* Make fewer calls to fread

micro-optimization

* Fix memory size estimation for smaller models

ggml works with some larger formats internally

* print location in all assert macros

* remove trailing whitespace

* add type_to_string entry for unknown

* Simplify quantization a bit

* fix cuBLAS compatibility

adding n_gpu_layers to rwkv_init_from_file won't work.
add an extra function instead

* fix quantize

* quantize: don't create output file if opening input fails

* Rename gpu offload layers

might want to avoid branding it with cublas in case we add something
like clblast support in the future

* Remove old read_int32 and write_int32 functions

It's all uints now

* Remove static from things

* Only call gpu_offload_layers if gpu_layer_count > 0

* Add rwkv_ prefix to all structures

* Braces

* Functions naming convention

* Remove blank line after comment

* Capitalize comments

* Re-add quantize explanatory comment

* Re-add histogram comment

* Convert all error messages to uppercase

* Make type conversions extern

for ffi bindings from other langs

* Name the state parts

The code in rwkv_eval to initialize the state (when state_in is
NULL) was getting very confusing so I just put everything in a
struct to name it.

* Fnvalid
This commit is contained in:
LoganDark 2023-05-31 04:31:19 -07:00 committed by GitHub
parent 241350fde6
commit 363dfb1a06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 949 additions and 651 deletions

1558
rwkv.cpp

File diff suppressed because it is too large Load Diff

7
rwkv.h
View File

@@ -83,8 +83,11 @@ extern "C" {
     // Returns NULL on any error. Error messages would be printed to stderr.
     // - model_file_path: path to model file in ggml format.
     // - n_threads: count of threads to use, must be positive.
-    // - n_gpu_layer: count of layers need to load to gpu (only works when cuBLAS is on)
-    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
+    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
+
+    // Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+    // If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+    RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers);

    // Evaluates the model for a single token.
    // Returns false on any error. Error messages would be printed to stderr.

View File

@@ -32,11 +32,14 @@ class RWKVModel:
        assert os.path.isfile(model_path), f'{model_path} is not a file'
        assert thread_count > 0, 'Thread count must be positive'
-       assert gpu_layers_count > 0, 'GPU layers count must be positive'
+       assert gpu_layers_count >= 0, 'GPU layers count must be >= 0'

        self._library = shared_library

-       self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layers_count)
+       self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
+
+       if gpu_layers_count > 0:
+           self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)

        self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx)
        self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx)

View File

@@ -37,9 +37,12 @@ class RWKVSharedLibrary:
        self.library = ctypes.cdll.LoadLibrary(shared_library_path)

-       self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32]
+       self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
        self.library.rwkv_init_from_file.restype = ctypes.c_void_p

+       self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
+       self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
+
        self.library.rwkv_eval.argtypes = [
            ctypes.c_void_p, # ctx
            ctypes.c_int32, # token
@@ -67,7 +70,7 @@ class RWKVSharedLibrary:
        self.library.rwkv_get_system_info_string.argtypes = []
        self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p

-   def rwkv_init_from_file(self, model_file_path: str, thread_count: int, gpu_layers_count: int) -> RWKVContext:
+   def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
        """
        Loads the model from a file and prepares it for inference.
        Throws an exception in case of any error. Error messages would be printed to stderr.
@@ -83,11 +86,23 @@ class RWKVSharedLibrary:
        """

        ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'),
-                                              ctypes.c_uint32(thread_count),
-                                              ctypes.c_uint32(gpu_layers_count))
+                                              ctypes.c_uint32(thread_count))

        assert ptr is not None, 'rwkv_init_from_file failed, check stderr'

        return RWKVContext(ptr)

+   def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
+       """
+       Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+       If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+
+       Parameters
+       ----------
+       gpu_layers_count : int
+           Count of layers to load onto gpu, must be >= 0, only enabled with cuBLAS.
+       """
+
+       assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr'
+
    def rwkv_eval(
        self,
        ctx: RWKVContext,

View File

@@ -26,9 +26,12 @@
void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
    fprintf(stderr, "Testing %s\n", model_path);

-   struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS, N_GPU_LAYERS);
+   struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);

    enum rwkv_error_flags error = rwkv_get_last_error(NULL);
    ASSERT(error == 0, "Unexpected error %d", error);
+#ifdef GGML_USE_CUBLAS
+   ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model));
+#endif

    uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);