File parsing and memory usage optimization (#74)
* Rework the entire file parsing system to prepare for future changes
* Estimate memory usage perfectly; removes whatever issue with small models that used to exist
* Fix file stream ops on macOS; for me this compiles on Windows 11, Ubuntu 20.04, and macOS 10.14
* Fix rwkv.cpp for non-WIN32 MSVC invocations like bindgen-rs
* Implement Q8_1 quantization... and disable the type, because GGML doesn't support the ops required to run inference with it. It's not worth any nasty hacks or workarounds right now; Q8_0 is very similar if one wants 8-bit quantization.
* Completely remove Q8_1 type. This type isn't meant to be user-facing in any way, so I may as well get rid of it now, since it will probably never exist as a data format.
* Switch from std::vector to unique array for model layers; these don't ever need to be resized
* Factor ffn.key.weight height into memory estimate. Some models have this set weirdly, in various different ways; just give up, record its actual size, and use that.
* Make a few more operations inplace. ggml doesn't currently expose most of the stuff it supports, so force some things. Not 100% sure about this; I don't think the memory savings are worth it.
* Attempt a perfect upper bound size for the scratch space. This should be the largest work_size seen in any model, since it is always larger than any of the other parameters except vocab (which does not participate in the graph work size).
* Revert "Make a few more operations inplace". This reverts commit f94d6eb216040ae0ad23d2b9c87fae8349882f89.
* Make fewer calls to fread (micro-optimization)
* Fix memory size estimation for smaller models; ggml works with some larger formats internally
* Print location in all assert macros
* Remove trailing whitespace
* Add type_to_string entry for unknown
* Simplify quantization a bit
* Fix cuBLAS compatibility. Adding n_gpu_layers to rwkv_init_from_file won't work; add an extra function instead (see the rwkv.h hunk and the sketch below).
* Fix quantize
* quantize: don't create output file if opening input fails
* Rename GPU offload layers; might want to avoid branding it with cuBLAS in case we add something like CLBlast support in the future
* Remove old read_int32 and write_int32 functions; it's all uints now
* Remove static from things
* Only call gpu_offload_layers if gpu_layer_count > 0
* Add rwkv_ prefix to all structures
* Braces
* Functions naming convention
* Remove blank line after comment
* Capitalize comments
* Re-add quantize explanatory comment
* Re-add histogram comment
* Convert all error messages to uppercase
* Make type conversions extern for FFI bindings from other languages
* Name the state parts. The code in rwkv_eval to initialize the state (when state_in is NULL) was getting very confusing, so I just put everything in a struct to name it.
* Fnvalid
parent 241350fde6
commit 363dfb1a06

rwkv.h | 7
@@ -83,8 +83,11 @@ extern "C" {
     // Returns NULL on any error. Error messages would be printed to stderr.
     // - model_file_path: path to model file in ggml format.
     // - n_threads: count of threads to use, must be positive.
-    // - n_gpu_layer: count of layers need to load to gpu (only works when cuBLAS is on)
-    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
+    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
+
+    // Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+    // If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+    RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers);
 
     // Evaluates the model for a single token.
     // Returns false on any error. Error messages would be printed to stderr.
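The header change above is the user-facing core of the cuBLAS fix: rwkv_init_from_file no longer takes a GPU layer count, and offloading becomes a separate, optional call. A minimal before/after sketch through the Python bindings changed below, assuming `library` is an already-constructed RWKVSharedLibrary instance; the model path, thread count, and layer count are placeholders:

    # Before this commit, offloading was a third argument to init:
    #   ctx = library.rwkv_init_from_file('model.bin', 4, 32)

    # After this commit: initialize with the thread count only, then offload explicitly.
    ctx = library.rwkv_init_from_file('model.bin', 4)
    library.rwkv_gpu_offload_layers(ctx, 32)  # per the header comment, a no-op when built without cuBLAS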
@@ -32,11 +32,14 @@ class RWKVModel:
 
         assert os.path.isfile(model_path), f'{model_path} is not a file'
         assert thread_count > 0, 'Thread count must be positive'
-        assert gpu_layers_count > 0, 'GPU layers count must be positive'
+        assert gpu_layers_count >= 0, 'GPU layers count must be >= 0'
 
         self._library = shared_library
 
-        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layers_count)
+        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
+
+        if gpu_layers_count > 0:
+            self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)
 
         self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx)
         self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx)
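Because the assert was relaxed to gpu_layers_count >= 0, CPU-only users can pass 0 and no offload call is made at all. A hedged construction sketch; the module name rwkv_cpp_model, the constructor argument order (shared library first, then model path), and the file path are assumptions not shown in this hunk, and `library` is again an RWKVSharedLibrary instance:

    import rwkv_cpp_model  # module name assumed

    # gpu_layers_count=0: passes the new >= 0 assert, offloading is skipped entirely.
    cpu_model = rwkv_cpp_model.RWKVModel(library, 'model.bin', thread_count=4, gpu_layers_count=0)

    # gpu_layers_count>0: the constructor calls rwkv_gpu_offload_layers right after init.
    gpu_model = rwkv_cpp_model.RWKVModel(library, 'model.bin', thread_count=4, gpu_layers_count=32)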
@@ -37,9 +37,12 @@ class RWKVSharedLibrary:
 
         self.library = ctypes.cdll.LoadLibrary(shared_library_path)
 
-        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32]
+        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
         self.library.rwkv_init_from_file.restype = ctypes.c_void_p
 
+        self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
+        self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
+
         self.library.rwkv_eval.argtypes = [
             ctypes.c_void_p, # ctx
             ctypes.c_int32, # token
@@ -67,7 +70,7 @@ class RWKVSharedLibrary:
         self.library.rwkv_get_system_info_string.argtypes = []
         self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p
 
-    def rwkv_init_from_file(self, model_file_path: str, thread_count: int, gpu_layers_count: int) -> RWKVContext:
+    def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
         """
         Loads the model from a file and prepares it for inference.
         Throws an exception in case of any error. Error messages would be printed to stderr.
@@ -83,11 +86,23 @@ class RWKVSharedLibrary:
         """
 
         ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'),
-                                               ctypes.c_uint32(thread_count),
-                                               ctypes.c_uint32(gpu_layers_count))
+                                               ctypes.c_uint32(thread_count))
         assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
         return RWKVContext(ptr)
 
+    def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
+        """
+        Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+        If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+
+        Parameters
+        ----------
+        gpu_layers_count : int
+            Count of layers to load onto gpu, must be >= 0, only enabled with cuBLAS.
+        """
+
+        assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr'
+
     def rwkv_eval(
             self,
             ctx: RWKVContext,
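Put together, driving the wrapper directly is now a two-step flow. A short sketch under assumptions: the module name rwkv_cpp_shared_library and the library/model file names are placeholders, the constructor is assumed to take the path to the compiled shared library, and only the class name RWKVSharedLibrary and the two wrapper methods are taken from the diff above:

    from rwkv_cpp_shared_library import RWKVSharedLibrary  # module name assumed

    library = RWKVSharedLibrary('librwkv.so')  # platform-dependent filename, e.g. rwkv.dll on Windows
    ctx = library.rwkv_init_from_file('model.bin', 4)
    library.rwkv_gpu_offload_layers(ctx, 32)   # wrapper asserts on a false return; errors go to stderr
    # rwkv_eval, rwkv_get_state_buffer_element_count, etc. are unchanged by this commit.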
@@ -26,9 +26,12 @@
 void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
     fprintf(stderr, "Testing %s\n", model_path);
 
-    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS, N_GPU_LAYERS);
+    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);
     enum rwkv_error_flags error = rwkv_get_last_error(NULL);
     ASSERT(error == 0, "Unexpected error %d", error);
+#ifdef GGML_USE_CUBLAS
+    ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model));
+#endif
 
     uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);
 