File parsing and memory usage optimization (#74)
* Rework the entire file parsing system to prepare for future changes.
* Estimate memory usage perfectly. Removes whatever issue with small models used to exist.
* Fix file stream ops on macOS. For me this compiles on Windows 11, Ubuntu 20.04, and macOS 10.14.
* Fix rwkv.cpp for non-WIN32 MSVC invocations, like bindgen-rs.
* Implement Q8_1 quantization... and disable the type, because GGML doesn't support the ops required to run inference with it. It's not worth any nasty hacks or workarounds right now; Q8_0 is very similar if one wants 8-bit quantization (see the quantization sketch after this list).
* Completely remove the Q8_1 type. This type isn't meant to be user-facing in any way, so I may as well get rid of it now, since it will probably never exist as a data format.
* Switch from std::vector to a unique array for model layers. These don't ever need to be resized.
* Factor ffn.key.weight height into the memory estimate. Some models have this set weirdly, in various different ways; just give up, record the actual size of it, and use that.
* Make a few more operations inplace. ggml doesn't currently expose most of the stuff it supports, so force some things. Not 100% sure about this; I don't think the memory savings are worth that much.
* Attempt a perfect upper bound size for the scratch space. This should be the largest work_size seen in any model, since it is always larger than any of the other parameters except vocab (which does not participate in the graph work size).
* Revert "Make a few more operations inplace". This reverts commit f94d6eb216040ae0ad23d2b9c87fae8349882f89.
* Make fewer calls to fread. Micro-optimization.
* Fix memory size estimation for smaller models. ggml works with some larger formats internally.
* Print location in all assert macros (a sketch of such a macro follows the test diff below).
* Remove trailing whitespace.
* Add a type_to_string entry for unknown.
* Simplify quantization a bit.
* Fix cuBLAS compatibility. Adding n_gpu_layers to rwkv_init_from_file won't work; add an extra function instead (see the usage sketch after the rwkv.h diff below).
* Fix quantize.
* quantize: don't create the output file if opening the input fails.
* Rename GPU offload layers. Might want to avoid branding it with cuBLAS in case we add something like CLBlast support in the future.
* Remove old read_int32 and write_int32 functions. It's all uints now.
* Remove static from things.
* Only call gpu_offload_layers if gpu_layer_count > 0.
* Add the rwkv_ prefix to all structures.
* Braces.
* Functions naming convention.
* Remove blank line after comment.
* Capitalize comments.
* Re-add quantize explanatory comment.
* Re-add histogram comment.
* Convert all error messages to uppercase.
* Make type conversions extern for FFI bindings from other languages.
* Name the state parts. The code in rwkv_eval to initialize the state (when state_in is NULL) was getting very confusing, so I just put everything in a struct to name it.
* Fnvalid
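As context for the Q8_0/Q8_1 items above, here is a minimal, illustrative sketch of Q8_0-style 8-bit block quantization in C. The block size of 32, the struct layout, and the function names are assumptions chosen for illustration, not the exact ggml definitions; roughly, Q8_1 additionally carries a per-block sum/offset term, which is part of why it needs ops that plain Q8_0 does not.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Illustrative Q8_0-style block: 32 values share one float scale.
// The real ggml block layout may differ (e.g. a half-precision scale).
#define QK8_0 32

struct block_q8_0 {
    float  d;           // scale: original value is approximately d * qs[i]
    int8_t qs[QK8_0];   // quantized values in [-127, 127]
};

// Quantize n floats (n must be a multiple of QK8_0) into Q8_0-style blocks.
static void quantize_row_q8_0_ref(const float * x, struct block_q8_0 * y, int n) {
    for (int b = 0; b < n / QK8_0; b++) {
        float amax = 0.0f; // absolute max in this block
        for (int i = 0; i < QK8_0; i++) {
            float v = fabsf(x[b * QK8_0 + i]);
            if (v > amax) amax = v;
        }

        float d  = amax / 127.0f;
        float id = d != 0.0f ? 1.0f / d : 0.0f;

        y[b].d = d;
        for (int i = 0; i < QK8_0; i++) {
            y[b].qs[i] = (int8_t) roundf(x[b * QK8_0 + i] * id);
        }
    }
}

int main(void) {
    // Quantize one block of synthetic data and print the resulting scale.
    float x[QK8_0];
    for (int i = 0; i < QK8_0; i++) x[i] = (float) i - 16.0f;

    struct block_q8_0 out[1];
    quantize_row_q8_0_ref(x, out, QK8_0);

    printf("scale = %f, q[0] = %d, q[31] = %d\n", out[0].d, out[0].qs[0], out[0].qs[31]);
    return 0;
}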
This commit is contained in:
parent 241350fde6
commit 363dfb1a06
rwkv.h:

@@ -83,8 +83,11 @@ extern "C" {
     // Returns NULL on any error. Error messages would be printed to stderr.
     // - model_file_path: path to model file in ggml format.
     // - n_threads: count of threads to use, must be positive.
-    // - n_gpu_layer: count of layers need to load to gpu (only works when cuBLAS is on)
-    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads, const uint32_t n_gpu_layers);
+    RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
+
+    // Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+    // If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+    RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers);
 
     // Evaluates the model for a single token.
     // Returns false on any error. Error messages would be printed to stderr.
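To make the new two-step API concrete, here is a minimal usage sketch in C. The model path and the layer count are placeholder values; the rwkv_eval argument order (ctx, token, state_in, state_out, logits_out) is assumed from the Python bindings below, and rwkv_free is assumed to be the cleanup counterpart of rwkv_init_from_file. Treat this as illustrative, not canonical.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include "rwkv.h"

int main(void) {
    // Load the model with 4 threads; GPU layers are no longer passed here.
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 4);
    if (ctx == NULL) {
        fprintf(stderr, "Failed to load model\n");
        return 1;
    }

    // Optionally offload some layers; a no-op if cuBLAS support is not compiled in.
    uint32_t n_gpu_layers = 32; // example value
    if (n_gpu_layers > 0 && !rwkv_gpu_offload_layers(ctx, n_gpu_layers)) {
        fprintf(stderr, "GPU offload failed, continuing on CPU\n");
    }

    // Allocate state and logits buffers using the element counts the library reports.
    const size_t state_len  = rwkv_get_state_buffer_element_count(ctx);
    const size_t logits_len = rwkv_get_logits_buffer_element_count(ctx);
    float * state  = malloc(state_len  * sizeof(float));
    float * logits = malloc(logits_len * sizeof(float));

    // First call with state_in == NULL initializes the state; later calls reuse it.
    rwkv_eval(ctx, 0 /* token id */, NULL, state, logits);
    rwkv_eval(ctx, 1 /* next token id */, state, state, logits);

    free(state);
    free(logits);
    rwkv_free(ctx); // assumed cleanup function, mirroring rwkv_init_from_file

    return 0;
}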
Python model wrapper (class RWKVModel):

@@ -32,11 +32,14 @@ class RWKVModel:
 
         assert os.path.isfile(model_path), f'{model_path} is not a file'
         assert thread_count > 0, 'Thread count must be positive'
-        assert gpu_layers_count > 0, 'GPU layers count must be positive'
+        assert gpu_layers_count >= 0, 'GPU layers count must be >= 0'
 
         self._library = shared_library
 
-        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, gpu_layers_count)
+        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
+
+        if gpu_layers_count > 0:
+            self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)
 
         self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx)
         self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx)
Python shared library wrapper (class RWKVSharedLibrary):

@@ -37,9 +37,12 @@ class RWKVSharedLibrary:
 
         self.library = ctypes.cdll.LoadLibrary(shared_library_path)
 
-        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.c_uint32]
+        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
         self.library.rwkv_init_from_file.restype = ctypes.c_void_p
 
+        self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
+        self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
+
         self.library.rwkv_eval.argtypes = [
             ctypes.c_void_p, # ctx
             ctypes.c_int32, # token
@@ -67,7 +70,7 @@ class RWKVSharedLibrary:
         self.library.rwkv_get_system_info_string.argtypes = []
         self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p
 
-    def rwkv_init_from_file(self, model_file_path: str, thread_count: int, gpu_layers_count: int) -> RWKVContext:
+    def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
         """
         Loads the model from a file and prepares it for inference.
         Throws an exception in case of any error. Error messages would be printed to stderr.
@@ -83,11 +86,23 @@
         """
 
         ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'),
-                                               ctypes.c_uint32(thread_count),
-                                               ctypes.c_uint32(gpu_layers_count))
+                                               ctypes.c_uint32(thread_count))
         assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
         return RWKVContext(ptr)
 
+    def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
+        """
+        Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
+        If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
+
+        Parameters
+        ----------
+        gpu_layers_count : int
+            Count of layers to load onto gpu, must be >= 0, only enabled with cuBLAS.
+        """
+
+        assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr'
 
     def rwkv_eval(
             self,
             ctx: RWKVContext,
C test harness:

@@ -26,9 +26,12 @@
 void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
     fprintf(stderr, "Testing %s\n", model_path);
 
-    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS, N_GPU_LAYERS);
+    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);
     enum rwkv_error_flags error = rwkv_get_last_error(NULL);
     ASSERT(error == 0, "Unexpected error %d", error);
+#ifdef GGML_USE_CUBLAS
+    ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model));
+#endif
 
     uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);
 
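One commit item above is "print location in all assert macros"; the ASSERT used in this test presumably follows that pattern. Below is a minimal sketch of such a macro, with the message format and the abort-on-failure behavior assumed for illustration rather than quoted from the real test header.

#include <stdio.h>
#include <stdlib.h>

// Sketch of an assert macro that reports the failure location before the message.
// The real macro in the test suite may differ in format and in how it exits.
#define ASSERT(x, ...) do {                                                        \
        if (!(x)) {                                                                \
            fprintf(stderr, "%s:%d: assertion failed: ", __FILE__, __LINE__);      \
            fprintf(stderr, __VA_ARGS__);                                          \
            fprintf(stderr, "\n");                                                 \
            abort();                                                               \
        }                                                                          \
    } while (0)

int main(void) {
    int error = 0;
    // Same usage shape as the test above: condition, printf-style message, arguments.
    ASSERT(error == 0, "Unexpected error %d", error);
    return 0;
}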