diff --git a/README.md b/README.md index 29e276d..9a1f0da 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Inference code runs, but outputs all `NaN`s in logits, most probably due to miss 2. Validate states and logits against [reference implementation](https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_in_150_lines.py) by creating a testing script 3. Heavily refactor code; optimize where possible 4. Make FP16 inference work -5. Create fancy interface with sockets/shared memory/pipes/etc. +5. Create fancy interface with sockets/shared memory/pipes/something else 6. Create Python wrapper with sampling and simple chat interface 7. Write a good `README.md` and publish links to this repo 8. Make INT4 inference work @@ -27,7 +27,7 @@ This repo is based on the [llama.cpp repo](https://github.com/ggerganov/llama.cp - `./rwkv`: directory containing Python scripts - `./examples/main_rwkw`: directory containing script that loads and infers RWKV model -Please do not change files in other directories -- this will make pulling recent changes easier. +Please do not change files in other directories — this will make pulling recent changes easier. ## How to use @@ -46,7 +46,7 @@ cmake . Download an RWKV model from [Huggingface](https://huggingface.co/BlinkDL) and convert it into `ggml` format: ```commandline -python convert_pytorch_rwkv_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float32 +python rwkv\convert_pytorch_rwkv_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float32 ``` Compile and run the script: diff --git a/examples/main_rwkv/main_rwkv.cpp b/examples/main_rwkv/main_rwkv.cpp index 9288e6f..9f81e1a 100644 --- a/examples/main_rwkv/main_rwkv.cpp +++ b/examples/main_rwkv/main_rwkv.cpp @@ -14,6 +14,7 @@ // --- Utilities --- +// Checks that x is not false. If it is false, prints fancy message to stderr and aborts the execution. #define RWKV_ASSERT(x, ...) 
\ do { \ if (!(x)) { \ @@ -24,6 +25,9 @@ } \ } while (0) +// Formats and prints a message to stderr. Trailing newline is added automatically. +#define RWKV_LOG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0) + // TODO Move to ggml, if correct float ggml_get_f32_2d(struct ggml_tensor * tensor, int i, int j) { RWKV_ASSERT(tensor->n_dims == 2, "Not a 2D tensor"); @@ -46,8 +50,8 @@ void print_tensor(struct ggml_tensor * tensor, char * name) { RWKV_ASSERT(x >= 6, "Too small tensor"); - printf( - "1D tensor %s, shape (%d), [%f %f %f ... %f %f %f]\n", + RWKV_LOG( + "1D tensor %s, shape (%d), [%f %f %f ... %f %f %f]", name, x, ggml_get_f32_1d(tensor, 0), @@ -62,8 +66,8 @@ void print_tensor(struct ggml_tensor * tensor, char * name) { int y = tensor->ne[1]; if (y < 6) { - printf( - "2D tensor %s, shape (%d, %d), [[%f %f %f ... %f %f %f]]\n", + RWKV_LOG( + "2D tensor %s, shape (%d, %d), [[%f %f %f ... %f %f %f]]", name, x, y, @@ -75,8 +79,8 @@ void print_tensor(struct ggml_tensor * tensor, char * name) { ggml_get_f32_2d(tensor, x - 1, y - 1) ); } else { - printf( - "2D tensor %s, shape (%d, %d), [[%f %f %f ... ] ... [ ... %f %f %f]]\n", + RWKV_LOG( + "2D tensor %s, shape (%d, %d), [[%f %f %f ... ] ... [ ... %f %f %f]]", name, x, y, @@ -88,26 +92,6 @@ void print_tensor(struct ggml_tensor * tensor, char * name) { ggml_get_f32_2d(tensor, x - 1, y - 1) ); } - } else if (n_dims == 3) { - int x = tensor->ne[0]; - int y = tensor->ne[1]; - int z = tensor->ne[2]; - - RWKV_ASSERT(z >= 6, "Too small tensor"); - - printf( - "3D tensor %s, shape (%d, %d, %d), [[[%f %f %f ...] ... [... 
%f %f %f]]]\n", - name, - x, - y, - z, - ggml_get_f32_3d(tensor, 0, 0, 0), - ggml_get_f32_3d(tensor, 0, 0, 1), - ggml_get_f32_3d(tensor, 0, 0, 2), - ggml_get_f32_3d(tensor, x - 1, y - 1, z - 3), - ggml_get_f32_3d(tensor, x - 1, y - 1, z - 2), - ggml_get_f32_3d(tensor, x - 1, y - 1, z - 1) - ); } else { RWKV_ASSERT(false, "Unsupported dimension count %d", n_dims); } @@ -116,7 +100,7 @@ void print_tensor(struct ggml_tensor * tensor, char * name) { // Prints tensor name, dimensionality, shape and part of its contents. #define PRINT_TENSOR(x) print_tensor(x, #x) -// Same as above, but additionally computes tensor graph before printing the tensor. +// Same as PRINT_TENSOR, but additionally computes tensor graph before printing the tensor. #define COMPUTE_AND_PRINT_TENSOR(ctx, x) do { compute_graph(ctx, x); print_tensor(x, #x); } while (0) // Computes value of the tensor and all tensors it depends on. @@ -201,7 +185,7 @@ void set_block_parameter(std::unordered_map * // Loads RWKV model metadata and parameters from a file. 
void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * model) { - printf("Loading model from %s\n", file_path); + RWKV_LOG("Loading model from %s", file_path); FILE * file = fopen(file_path, "rb"); RWKV_ASSERT(file != NULL, "Failed to open file %s", file_path); @@ -227,9 +211,9 @@ void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * m RWKV_ASSERT(model->data_type == 0, "Data types other than float32 are not yet supported"); // TODO - printf("n_vocab = %d\n", model->n_vocab); - printf("n_embed = %d\n", model->n_embed); - printf("n_layer = %d\n", model->n_layer); + RWKV_LOG("n_vocab = %d", model->n_vocab); + RWKV_LOG("n_embed = %d", model->n_embed); + RWKV_LOG("n_layer = %d", model->n_layer); std::unordered_map parameters; @@ -288,7 +272,7 @@ void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * m fclose(file); - printf("Initializing model parameters\n"); + RWKV_LOG("Initializing model parameters"); model->layers.resize(model->n_layer); @@ -372,7 +356,7 @@ int main(int argc, char ** argv) { char * logits_out_path = argv[5]; int32_t token = strtol(token_s, (char **) NULL, 10); - printf("Token index is %d\n", token); + RWKV_LOG("Token index is %d", token); bool create_new_state = strcmp(state_in_path, "") == 0; @@ -399,7 +383,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, state_element_count); if (create_new_state) { - printf("Creating new state\n"); + RWKV_LOG("Creating new state"); ggml_set_f32(state, 0.0F); // TODO Verify correctness @@ -410,7 +394,7 @@ int main(int argc, char ** argv) { ggml_set_f32(state_part, -1e30F); } } else { - printf("Loading state from %s\n", state_in_path); + RWKV_LOG("Loading state from %s", state_in_path); int32_t state_file_size = state_element_count * 4; FILE * state_in_file = fopen(state_in_path, "rb"); @@ -606,7 +590,7 @@ int main(int argc, char ** argv) { ggml_free(ctx); - printf("OK\n"); + 
RWKV_LOG("OK"); return 0; } diff --git a/rwkv/convert_pytorch_rwkv_to_ggml.py b/rwkv/convert_pytorch_rwkv_to_ggml.py index 55f70b9..7c1a339 100644 --- a/rwkv/convert_pytorch_rwkv_to_ggml.py +++ b/rwkv/convert_pytorch_rwkv_to_ggml.py @@ -8,7 +8,7 @@ # // All ints and floats are in machine byte order. # // Magic is "ggml" string bytes. # int32 magic = 0x67676d66; -# int32 version; +# int32 version = 100; # int32 n_vocab; # int32 n_embed; # int32 n_layer; @@ -106,6 +106,7 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t 1 if tensor.dtype == torch.float16 else 0 )) + # Note that, unlike in llama.cpp, the shape is not reversed here! for dim in tensor.shape: out_file.write(struct.pack('=i', dim))