File parsing and memory usage optimization (#74)

* Rework the entire file parsing system to prepare for future changes * Estimate memory usage perfectly. Removes whatever issue with small models that used to exist * Fix file stream ops on macOS. For me this compiles on Windows 11, Ubuntu 20.04, and macOS 10.14 * Fix rwkv.cpp for non-WIN32 MSVC invocations like bindgen-rs * Implement Q8_1 quantization ...and disable the type, because GGML doesn't support the ops required to run inference with it. It's not worth any nasty hacks or workarounds right now; Q8_0 is very, very similar if one wants 8-bit quantization. * Completely remove Q8_1 type. This type isn't meant to be user-facing in any way, so I may as well get rid of it now since it will probably never exist as a data format. * Switch from std::vector to unique array for model layers. These don't ever need to be resized * Factor ffn.key.weight height into memory estimate. Some models have this set weirdly, in various different ways. Just give up and record the actual size of it and use that * Make a few more operations inplace. ggml doesn't currently expose most of the stuff it supports, so force some things. Not 100% sure about this; I don't think the memory savings are that worth it * Attempt a perfect upper bound size for the scratch space. This should be the largest work_size seen in any model, since it is always larger than any of the other parameters except vocab (which does not participate in the graph work size). * Revert "Make a few more operations inplace". This reverts commit f94d6eb216040ae0ad23d2b9c87fae8349882f89. * Make fewer calls to fread (micro-optimization) * Fix memory size estimation for smaller models. ggml works with some larger formats internally * print location in all assert macros * remove trailing whitespace * add type_to_string entry for unknown * Simplify quantization a bit * fix cuBLAS compatibility: adding n_gpu_layers to rwkv_init_from_file won't work. 
add an extra function instead * fix quantize * quantize: don't create output file if opening input fails * Rename gpu offload layers might want to avoid branding it with cublas in case we add something like clblast support in the future * Remove old read_int32 and write_int32 functions It's all uints now * Remove static from things * Only call gpu_offload_layers if gpu_layer_count > 0 * Add rwkv_ prefix to all structures * Braces * Functions naming convention * Remove blank line after comment * Capitalize comments * Re-add quantize explanatory comment * Re-add histogram comment * Convert all error messages to uppercase * Make type conversions extern for ffi bindings from other langs * Name the state parts The code in rwkv_eval to initialize the state (when state_in is NULL) was getting very confusing so I just put everything in a struct to name it. * Fnvalid
2023-05-31 04:31:19 -07:00 · 2023-05-31 04:31:19 -07:00 · 363dfb1a06
parent 241350fde6
commit 363dfb1a06
5 changed files with 949 additions and 651 deletions
--- a/rwkv.cpp
+++ b/rwkv.cpp
--- a/rwkv.h
+++ b/rwkv.h
@ -83,8 +83,11 @@ extern "C" {
    // Returns NULL on any error. Error messages would be printed to stderr.
    // - model_file_path: path to model file in ggml format.
    // - n_threads: count of threads to use, must be positive.
-    // - n_gpu_layer: count of layers need to load to gpu (only works when cuBLAS is on)
-    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
+    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
+
+    // Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+    // If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+    RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers);

    // Evaluates the model for a single token.
    // Returns false on any error. Error messages would be printed to stderr.
--- a/rwkv/rwkv_cpp_model.py
+++ b/rwkv/rwkv_cpp_model.py
@ -32,11 +32,14 @@ class RWKVModel:

        assert os.path.isfile(model_path), f'{model_path} is not a file'
        assert thread_count > 0, 'Thread count must be positive'
-        assert gpu_layers_count > 0, 'GPU layers count must be positive'
+        assert gpu_layers_count >= 0, 'GPU layers count must be >= 0'

        self._library = shared_library

-        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layers_count)
+        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
+
+        if gpu_layers_count > 0:
+	        self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)

        self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx)
        self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx)
--- a/rwkv/rwkv_cpp_shared_library.py
+++ b/rwkv/rwkv_cpp_shared_library.py
@ -37,9 +37,12 @@ class RWKVSharedLibrary:

        self.library = ctypes.cdll.LoadLibrary(shared_library_path)

-        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32]
+        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
        self.library.rwkv_init_from_file.restype = ctypes.c_void_p

+        self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
+        self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
+
        self.library.rwkv_eval.argtypes = [
            ctypes.c_void_p, # ctx
            ctypes.c_int32, # token
@ -67,7 +70,7 @@ class RWKVSharedLibrary:
        self.library.rwkv_get_system_info_string.argtypes = []
        self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p

-    def rwkv_init_from_file(self, model_file_path: str, thread_count: int, gpu_layers_count: int) -> RWKVContext:
+    def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
        """
        Loads the model from a file and prepares it for inference.
        Throws an exception in case of any error. Error messages would be printed to stderr.
@ -83,11 +86,23 @@ class RWKVSharedLibrary:
        """

        ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'),
-                                               ctypes.c_uint32(thread_count),
-                                               ctypes.c_uint32(gpu_layers_count))
+                                               ctypes.c_uint32(thread_count))
        assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
        return RWKVContext(ptr)

+    def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
+        """
+        Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+        If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+
+        Parameters
+        ----------
+        ctx : RWKVContext
+            RWKV context whose layers should be offloaded.
+        gpu_layers_count : int
+            Count of layers to load onto GPU, must be >= 0, only enabled with cuBLAS.
+        """
+
+        # Perform the native call outside of the assert statement: assert statements
+        # are removed when Python runs with -O, which would otherwise skip the call.
+        success = self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count))
+
+        assert success, 'rwkv_gpu_offload_layers failed, check stderr'
+
    def rwkv_eval(
            self,
            ctx: RWKVContext,
--- a/tests/test_tiny_rwkv.c
+++ b/tests/test_tiny_rwkv.c
@ -26,9 +26,12 @@
 void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
    fprintf(stderr, "Testing %s\n", model_path);

-    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS, N_GPU_LAYERS);
+    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);
    enum rwkv_error_flags error = rwkv_get_last_error(NULL);
    ASSERT(error == 0, "Unexpected error %d", error);
+#ifdef GGML_USE_CUBLAS
+    ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model));
+#endif

    uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);