File parsing and memory usage optimization (#74)

* Rework the entire file parsing system

prepare for future changes

* Estimate memory usage perfectly

Removes whatever issue with small models that used to exist

* Fix file stream ops on macOS

for me this compiles on Windows 11, Ubuntu 20.04, and macOS 10.14

* Fix rwkv.cpp for non-WIN32 MSVC invocations like bindgen-rs

* Implement Q8_1 quantization

...and disable the type, because GGML doesn't support the ops
required to run inference with it.

It's not worth any nasty hacks or workarounds right now, Q8_0 is
very very similar if one wants 8-bit quantization.

* Completely remove Q8_1 type

This type isn't meant to be user-facing in any way so I may as well
get rid of it now since it will probably never exist as a data
format.

* Switch from std::vector to unique array for model layers

These don't ever need to be resized

* Factor ffn.key.weight height into memory estimate

some models have this set weirdly, in various different ways.
just give up and record the actual size of it and use that

* Make a few more operations inplace

ggml doesn't currently expose most of the stuff it supports, so
force some things. not 100% sure about this, I don't think the
memory savings are that worth it

* attempt a perfect upper bound size for the scratch space

This should be the largest work_size seen in any model, since it
is always larger than any of the other parameters except vocab
(which does not participate in the graph work size).

* Revert "Make a few more operations inplace"

This reverts commit f94d6eb216040ae0ad23d2b9c87fae8349882f89.

* Make less calls to fread

micro-optimization

* Fix memory size estimation for smaller models

ggml works with some larger formats internally

* print location in all assert macros

* remove trailing whitespace

* add type_to_string entry for unknown

* Simplify quantization a bit

* fix cuBLAS compatibility

adding n_gpu_layers to rwkv_init_from_file won't work.
add an extra function instead

* fix quantize

* quantize: don't create output file if opening input fails

* Rename gpu offload layers

might want to avoid branding it with cublas in case we add something
like clblast support in the future

* Remove old read_int32 and write_int32 functions

It's all uints now

* Remove static from things

* Only call gpu_offload_layers if gpu_layer_count > 0

* Add rwkv_ prefix to all structures

* Braces

* Functions naming convention

* Remove blank line after comment

* Capitalize comments

* Re-add quantize explanatory comment

* Re-add histogram comment

* Convert all error messages to uppercase

* Make type conversions extern

for ffi bindings from other langs

* Name the state parts

The code in rwkv_eval to initialize the state (when state_in is
NULL) was getting very confusing so I just put everything in a
struct to name it.

* Fnvalid
This commit is contained in:
LoganDark 2023-05-31 04:31:19 -07:00 committed by GitHub
parent 241350fde6
commit 363dfb1a06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 949 additions and 651 deletions

1558
rwkv.cpp

File diff suppressed because it is too large Load Diff

7
rwkv.h
View File

@ -83,8 +83,11 @@ extern "C" {
// Returns NULL on any error. Error messages would be printed to stderr.
// - model_file_path: path to model file in ggml format.
// - n_threads: count of threads to use, must be positive.
// - n_gpu_layer: count of layers need to load to gpu (only works when cuBLAS is on)
RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
// Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
// If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers);
// Evaluates the model for a single token.
// Returns false on any error. Error messages would be printed to stderr.

View File

@ -32,11 +32,14 @@ class RWKVModel:
assert os.path.isfile(model_path), f'{model_path} is not a file'
assert thread_count > 0, 'Thread count must be positive'
assert gpu_layers_count > 0, 'GPU layers count must be positive'
assert gpu_layers_count >= 0, 'GPU layers count must be >= 0'
self._library = shared_library
self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layers_count)
self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
if gpu_layers_count > 0:
self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)
self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx)
self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx)

View File

@ -37,9 +37,12 @@ class RWKVSharedLibrary:
self.library = ctypes.cdll.LoadLibrary(shared_library_path)
self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32]
self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
self.library.rwkv_init_from_file.restype = ctypes.c_void_p
self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
self.library.rwkv_eval.argtypes = [
ctypes.c_void_p, # ctx
ctypes.c_int32, # token
@ -67,7 +70,7 @@ class RWKVSharedLibrary:
self.library.rwkv_get_system_info_string.argtypes = []
self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p
def rwkv_init_from_file(self, model_file_path: str, thread_count: int, gpu_layers_count: int) -> RWKVContext:
def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
"""
Loads the model from a file and prepares it for inference.
Throws an exception in case of any error. Error messages would be printed to stderr.
@ -83,11 +86,23 @@ class RWKVSharedLibrary:
"""
ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'),
ctypes.c_uint32(thread_count),
ctypes.c_uint32(gpu_layers_count))
ctypes.c_uint32(thread_count))
assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
return RWKVContext(ptr)
def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
"""
Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
Parameters
----------
gpu_layers_count : int
Count of layers to load onto gpu, must be >= 0, only enabled with cuBLAS.
"""
assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr'
def rwkv_eval(
self,
ctx: RWKVContext,

View File

@ -26,9 +26,12 @@
void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
fprintf(stderr, "Testing %s\n", model_path);
struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS, N_GPU_LAYERS);
struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);
enum rwkv_error_flags error = rwkv_get_last_error(NULL);
ASSERT(error == 0, "Unexpected error %d", error);
#ifdef GGML_USE_CUBLAS
ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model));
#endif
uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);