diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 07807e2..2049328 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -18,9 +18,49 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
+  ubuntu-latest-cmake-sanitizer:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug, Release]
+        accelerate: [ON, OFF]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: 'recursive'
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DRWKV_SANITIZE_${{ matrix.sanitizer }}=ON -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON -DRWKV_ACCELERATE=${{ matrix.accelerate }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build . --config ${{ matrix.build_type }}
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
+    continue-on-error: true
+
     steps:
       - name: Clone
         id: checkout
@@ -42,6 +82,12 @@ jobs:
           cmake -DBUILD_SHARED_LIBS=ON ..
           cmake --build . --config Release
 
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
       - name: Get commit hash
         id: commit
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -71,6 +117,8 @@ jobs:
   macOS-latest-cmake:
     runs-on: macOS-latest
 
+    continue-on-error: true
+
     steps:
       - name: Clone
         id: checkout
@@ -86,12 +134,19 @@ jobs:
 
       - name: Build
         id: cmake_build
+        # FMA disabled because it gives "Illegal instruction" in GitHub Actions runner
        run: |
          mkdir build
          cd build
-          cmake -DBUILD_SHARED_LIBS=ON -DRWKV_AVX2=OFF ..
+          cmake -DBUILD_SHARED_LIBS=ON -DRWKV_AVX2=OFF -DRWKV_FMA=OFF ..
          cmake --build . --config Release
 
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose
+
       - name: Get commit hash
         id: commit
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -121,6 +176,8 @@ jobs:
   windows-latest-cmake:
     runs-on: windows-latest
 
+    continue-on-error: true
+
     strategy:
      matrix:
        include:
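Editor's note: the three sanitizers in the new matrix catch different classes of runtime bugs — ADDRESS flags invalid memory accesses, THREAD flags data races, UNDEFINED flags undefined behavior such as signed overflow. As a purely hypothetical illustration (not part of this patch) of what the ADDRESS job is meant to surface, a program like the following compiles and often runs silently, but aborts with a heap-use-after-free report when built with -fsanitize=address:

    #include <stdlib.h>

    int main(void) {
        int * data = malloc(4 * sizeof(int));
        data[0] = 42;
        free(data);
        // Read after free: reported by the ADDRESS sanitizer, usually silent otherwise.
        return data[0];
    }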
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 318ffd1..7556082 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,15 +10,6 @@ endif()
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-    set(RWKV_STANDALONE ON)
-
-    # configure project version
-    # TODO
-else()
-    set(RWKV_STANDALONE OFF)
-endif()
-
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -31,27 +22,25 @@ else()
     endif()
 endif()
 
-
 #
 # Option list
 #
 
-# general
+# General
 option(RWKV_STATIC "rwkv: static link libraries" OFF)
 option(RWKV_NATIVE "rwkv: enable -march=native flag" OFF)
 option(RWKV_LTO "rwkv: enable link time optimization" OFF)
 
-# debug
+# Debug
 option(RWKV_ALL_WARNINGS "rwkv: enable all compiler warnings" ON)
-option(RWKV_ALL_WARNINGS_3RD_PARTY "rwkv: enable all compiler warnings in 3rd party libs" OFF)
 option(RWKV_GPROF "rwkv: enable gprof" OFF)
 
-# sanitizers
+# Sanitizers
 option(RWKV_SANITIZE_THREAD "rwkv: enable thread sanitizer" OFF)
 option(RWKV_SANITIZE_ADDRESS "rwkv: enable address sanitizer" OFF)
 option(RWKV_SANITIZE_UNDEFINED "rwkv: enable undefined sanitizer" OFF)
 
-# instruction set specific
+# Instruction set specific
 option(RWKV_AVX "rwkv: enable AVX" ON)
 option(RWKV_AVX2 "rwkv: enable AVX2" ON)
 option(RWKV_AVX512 "rwkv: enable AVX512" OFF)
@@ -72,16 +61,19 @@ find_package(Threads REQUIRED)
 
 if (NOT MSVC)
     if (RWKV_SANITIZE_THREAD)
+        message(STATUS "Using RWKV_SANITIZE_THREAD")
         add_compile_options(-fsanitize=thread)
         link_libraries(-fsanitize=thread)
     endif()
 
     if (RWKV_SANITIZE_ADDRESS)
+        message(STATUS "Using RWKV_SANITIZE_ADDRESS")
         add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
         link_libraries(-fsanitize=address)
     endif()
 
     if (RWKV_SANITIZE_UNDEFINED)
+        message(STATUS "Using RWKV_SANITIZE_UNDEFINED")
         add_compile_options(-fsanitize=undefined)
         link_libraries(-fsanitize=undefined)
     endif()
@@ -98,6 +90,7 @@ if (APPLE AND RWKV_ACCELERATE)
         message(WARNING "Accelerate framework not found")
     endif()
 endif()
+
 if (RWKV_OPENBLAS)
     if (RWKV_STATIC)
         set(BLA_STATIC ON)
@@ -136,7 +129,7 @@ if (RWKV_ALL_WARNINGS)
             -Wno-unused-function
         )
     else()
-        # todo : msvc
+        # TODO [llama.cpp]: msvc
     endif()
 
     add_compile_options(
@@ -157,7 +150,7 @@ if (RWKV_LTO)
 endif()
 
 # Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
+# TODO [llama.cpp]: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 if (NOT MSVC)
@@ -178,12 +171,12 @@ endif()
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
     message(STATUS "ARM detected")
     if (MSVC)
-        # TODO: arm msvc?
+        # TODO [llama.cpp]: arm msvc?
     else()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
             add_compile_options(-mcpu=native)
         endif()
-        # TODO: armv6,7,8 version specific flags
+        # TODO [llama.cpp]: armv6,7,8 version specific flags
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
     message(STATUS "x86 detected")
@@ -214,7 +207,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
         endif()
     endif()
 else()
-    # TODO: support PowerPC
+    # TODO [llama.cpp]: support PowerPC
     message(STATUS "Unknown architecture")
 endif()
@@ -232,15 +225,16 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
-add_library(rwkv
-    rwkv.cpp
-    rwkv.h)
+add_library(rwkv rwkv.cpp rwkv.h)
 
 target_include_directories(rwkv PUBLIC .)
 
-target_compile_features(rwkv PUBLIC cxx_std_11) # don't bump
+target_compile_features(rwkv PUBLIC cxx_std_11) # Don't bump
 
 target_link_libraries(rwkv PRIVATE ggml ${RWKV_EXTRA_LIBS})
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(rwkv PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(rwkv PRIVATE RWKV_SHARED RWKV_BUILD)
 endif()
+
+enable_testing()
+add_subdirectory(tests)
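Editor's note: with enable_testing() and the tests subdirectory wired in, a local sanitizer run mirrors the new CI job: configure with, e.g., -DRWKV_SANITIZE_ADDRESS=ON -DGGML_SANITIZE_ADDRESS=ON, rebuild, and run ctest --verbose from the build directory. The added message(STATUS "Using ...") lines make it easy to confirm in the configure log which sanitizer actually took effect.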
diff --git a/LICENSE b/LICENSE
index 76f67ef..c3e5ff6 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023 saharNooby
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 43cd5c1..3e21ced 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,7 @@ Loading LoRA checkpoints in [Blealtan's format](https://github.com/Blealtan/RWKV
 
 **TODO (contributions welcome!)**:
 
 1. Measure latency and perplexity of different model sizes (169M to 14B) and data types (`FP32`, `FP16`, `Q4_0`, `Q4_1`, `Q4_1_O`)
-2. Test on Linux (including Colab) and MacOS
-3. Make required memory calculation more robust (see [#4](https://github.com/saharNooby/rwkv.cpp/issues/4))
+2. Make required memory calculation more robust (see [#4](https://github.com/saharNooby/rwkv.cpp/issues/4))
 
 ## How to use
diff --git a/ggml b/ggml
index 538e516..0330904 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 538e516aced0aae5b22cbe7e691169e6957df753
+Subproject commit 03309047d2e65c05ffefbf64c6c4c943e6647c64
diff --git a/rwkv.cpp b/rwkv.cpp
index f21e9cb..835d0d8 100644
--- a/rwkv.cpp
+++ b/rwkv.cpp
@@ -240,6 +240,7 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr
     struct ggml_init_params params;
     params.mem_size = memory_required;
     params.mem_buffer = NULL;
+    params.no_alloc = false;
     struct ggml_context * ctx = ggml_init(params);
 
     std::unordered_map<std::string, struct ggml_tensor *> parameters;
@@ -587,11 +588,12 @@ bool rwkv_eval(struct rwkv_context * ctx, int32_t token, float * state_in, float
 }
 
 void rwkv_free(struct rwkv_context * ctx) {
+    ctx->model->layers.~vector();
+    free(ctx->model);
+    delete[] ctx->state_parts;
     ggml_free(ctx->ctx);
-
-    delete ctx->model;
-    delete ctx->state_parts;
-    delete ctx;
+    free(ctx->graph);
+    free(ctx);
 }
 
 bool rwkv_quantize_model_file(const char * model_file_path_in, const char * model_file_path_out, uint32_t q_type) {
@@ -599,7 +601,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
 
     // Needed to initialize FP16 lookup table
     {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
@@ -789,7 +791,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
     printf("original size = %8.2f MB\n", total_size_orig / 1024.0 / 1024.0);
     printf("quantized size = %8.2f MB\n", total_size_new / 1024.0 / 1024.0);
-    printf("compression ratio = %8.2f%%\n", 1.0 * total_size_orig / total_size_new);
+    printf("compression ratio = %8.2f\n", 1.0 * total_size_orig / total_size_new);
 
     {
         int64_t sum_all = 0;
diff --git a/rwkv.h b/rwkv.h
index 21341bc..ca4444e 100644
--- a/rwkv.h
+++ b/rwkv.h
@@ -56,7 +56,7 @@ extern "C" {
     // Returns false on any error. Error messages would be printed to stderr.
     // - model_file_path_in: path to model file in ggml format, must be either FP32 or FP16.
     // - model_file_path_out: quantized model will be written here.
-    // - q_type: set to 2 for GGML_TYPE_Q4_0, set to 3 for GGML_TYPE_Q4_1.
+    // - q_type: set to 2 for GGML_TYPE_Q4_0, set to 3 for GGML_TYPE_Q4_1, set to 4 for GGML_TYPE_Q4_1_O.
     RWKV_API bool rwkv_quantize_model_file(const char * model_file_path_in, const char * model_file_path_out, uint32_t q_type);
 
     // Returns system information string.
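Editor's note: the rwkv.h comment now documents all three q_type values. A minimal usage sketch of the quantization API (file names are hypothetical; error handling is reduced to the boolean the API already returns):

    #include "rwkv.h"

    int main(void) {
        // q_type 4 selects the new GGML_TYPE_Q4_1_O format.
        if (!rwkv_quantize_model_file("model-FP32.bin", "model-Q4_1_O.bin", 4)) {
            return 1; // Error details have already been printed to stderr by the library.
        }

        return 0;
    }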
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..2349544
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,15 @@
+function(rwkv_add_test source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    add_executable(${TEST_TARGET} ${source})
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml rwkv)
+    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+endfunction()
+
+file(COPY tiny-rwkv-660K-FP32.bin DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+file(COPY tiny-rwkv-660K-FP16.bin DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+file(COPY expected_logits.bin DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+
+rwkv_add_test(test_ggml_basics.c)
+rwkv_add_test(test_Q4_1_O.c)
+rwkv_add_test(test_Q4_1_O_large_matmul.c)
+rwkv_add_test(test_tiny_rwkv.c)
diff --git a/tests/expected_logits.bin b/tests/expected_logits.bin
new file mode 100644
index 0000000..177a208
Binary files /dev/null and b/tests/expected_logits.bin differ
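Editor's note: the test model and logits files are copied into ${CMAKE_CURRENT_BINARY_DIR} because ctest runs each test with that directory as its working directory, so the fopen("expected_logits.bin", ...) and rwkv_init_from_file calls in the tests below resolve against these copies.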
diff --git a/tests/test_Q4_1_O.c b/tests/test_Q4_1_O.c
new file mode 100644
index 0000000..a22d007
--- /dev/null
+++ b/tests/test_Q4_1_O.c
@@ -0,0 +1,174 @@
+// Tests that Q4_1_O basics (quantization, dequantization, matmul) work.
+
+#include "ggml.h"
+#include "rwkv.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define GET_ELEMENT_F32(tensor, i) (((float *) tensor->data)[i])
+
+#define SET_ELEMENT_F32(tensor, i, value) ((float *) tensor->data)[i] = value
+
+#define ASSERT(x, ...) {\
+    if (!(x)) {\
+        fprintf(stderr, "*** Assertion failed ***\n");\
+        fprintf(stderr, __VA_ARGS__);\
+        fprintf(stderr, "\n%s:%d\n", __FILE__, __LINE__);\
+        abort();\
+    }\
+}
+
+// ---
+
+#define QK 32
+
+// Copied from ggml.c
+typedef struct {
+    ggml_fp16_t d;
+    ggml_fp16_t m;
+    uint16_t outlier_index;
+    ggml_fp16_t outlier_value;
+    uint8_t qs[QK / 2];
+} block_q4_1_o;
+
+int main(int argc, const char ** argv) {
+    ASSERT(sizeof(block_q4_1_o) == 8 + QK / 2, "Wrong q4_1_o block size/padding");
+
+    // Needed to initialize FP16 lookup table
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    fprintf(stderr, "System info: %s\n", rwkv_get_system_info_string());
+
+    quantize_fns_t quantize_fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1_O);
+
+    float src[QK];
+    uint8_t dest[24];
+
+    // 1..32
+    for (int i = 0; i < QK; i++) {
+        src[i] = (float) (i + 1);
+    }
+
+    // --- Quantization ---
+    (quantize_fns.quantize_row_q)(src, dest, QK);
+
+    float delta_result = ggml_fp16_to_fp32(((block_q4_1_o *) dest)->d);
+    float delta_expected = (src[30] - src[0]) / ((1 << 4) - 1);
+    ASSERT(delta_result == delta_expected, "%f, %f", delta_result, delta_expected);
+
+    float min_result = ggml_fp16_to_fp32(((block_q4_1_o *) dest)->m);
+    float min_expected = src[0];
+    ASSERT(min_result == min_expected, "%f, %f", min_result, min_expected);
+
+    uint16_t outlier_index = ((block_q4_1_o *) dest)->outlier_index;
+    uint16_t outlier_index_expected = 31;
+    ASSERT(outlier_index == outlier_index_expected, "%d, %d", outlier_index, outlier_index_expected);
+
+    float outlier_value_result = ggml_fp16_to_fp32(((block_q4_1_o *) dest)->outlier_value);
+    float outlier_value_expected = src[31];
+    ASSERT(outlier_value_result == outlier_value_expected, "%f, %f", outlier_value_result, outlier_value_expected);
+
+    for (int i = 0; i < QK - 1; i++) {
+        uint8_t q4_result = (i % 2) ? (dest[sizeof(float) * 2 + i / 2] >> 4) : (dest[sizeof(float) * 2 + i / 2] & 0xF);
+        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
+        ASSERT(q4_result == q4_expected, "%d: %d, %d", i, q4_result, q4_expected);
+    }
+
+    // --- Dequantization ---
+    float dequantized[QK];
+    (quantize_fns.dequantize_row_q)(dest, dequantized, QK);
+
+    for (int i = 0; i < QK; i++) {
+        float actual = dequantized[i];
+        float expected = src[i];
+        float diff = fabsf(actual - expected);
+        // Difference looks huge, but the range is 0..31 -- compared to the range, it is not that huge
+        ASSERT(diff <= 1.0F, "%d: %f, %f", i, actual, expected);
+    }
+
+    // --- Matmul ---
+    struct ggml_init_params params = {
+        .mem_size = 16 * 1024,
+        .mem_buffer = NULL,
+        .no_alloc = false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * mat = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, QK, 4);
+
+    // Note rare outlier values: -88, -83, etc.
+    float mat_values[QK * 4] = {
+        -1.371795F, -88.901100F, -0.412088F, -0.486081F, 1.280220F, -1.067033F, 1.371795F, 1.099267F, 1.079487F, -0.204029F, 1.237729F, -0.563736F,
+        -0.633333F, 0.700000F, 0.211355F, 0.510989F, -0.981319F, -0.456777F, 0.011355F, 0.911722F, -0.976191F, 0.078022F, -0.757143F, -0.744689F,
+        -0.768865F, 0.656777F, 0.141026F, -0.038462F, 1.023810F, 1.221612F, -0.393773F, 1.135165F, -1.341758F, -83.113556F, 1.291209F, 0.313187F,
+        1.032601F, -0.401099F, 1.482418F, 0.823077F, 0.619414F, -0.583516F, 0.527106F, 1.489011F, 1.327839F, 0.846520F, -1.437729F, 0.461172F,
+        1.031136F, 0.293407F, 0.284615F, -1.102198F, -1.481685F, 0.602564F, -0.480952F, -0.745421F, -1.376190F, -1.319780F, 1.338828F, -1.062637F,
+        1.266300F, 0.360073F, 1.472894F, 1.063370F, -0.833333F, 49.047626F, -1.229670F, 1.079487F, -0.004762F, -0.696337F, -0.541758F, 0.993773F,
+        -1.323443F, 0.908059F, -1.059707F, 0.965201F, -0.376923F, 1.158608F, -1.100000F, -1.002564F, -0.355678F, 1.157143F, 0.450916F, -0.497802F,
+        1.270696F, 0.028205F, 1.075092F, 1.462637F, 0.252381F, -0.579121F, -0.880220F, -0.041392F, -1.017949F, -0.754945F, 0.582784F, -1.193773F,
+        -1.411355F, 122.014656F, -1.053114F, -0.949084F, 0.448718F, 0.209890F, 0.815751F, 0.071429F, -0.125641F, -0.600366F, -0.914652F, -0.956410F,
+        -0.278755F, 0.235531F, -0.573260F, -1.484615F, -0.327839F, -0.297070F, -1.195238F, -1.160073F, 0.932967F, -0.606960F, 0.798901F, 0.212088F,
+        0.113187F, -0.116117F, -0.532967F, 0.077289F, 0.016484F, 1.352747F, -1.487546F, -1.363736F
+    };
+
+    for (int i = 0; i < QK * 4; i++) {
+        SET_ELEMENT_F32(mat, i, mat_values[i]);
+    }
+
+    struct ggml_tensor * quantized_mat = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_1_O, QK, 4);
+
+    int64_t histogram[16];
+
+    ggml_quantize_q4_1_o(mat->data, quantized_mat->data, QK * 4, QK, histogram);
+
+    struct ggml_tensor * vec = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, QK);
+
+    float vec_values[] = {
+        -0.578388F, -0.770330F, -0.183516F, 0.264103F, 0.585714F, -0.226740F, 1.319048F, 0.652381F,
+        -1.161538F, 0.428205F, -0.907326F, -0.837729F, 0.673626F, 0.248718F, 0.392308F, -0.225275F,
+        0.910989F, 0.483150F, -0.669963F, -0.412088F, 0.954945F, 0.826007F, 0.113919F, 0.095604F,
+        -1.042125F, -1.094872F, 0.589377F, -0.426007F, 0.669231F, -0.243590F, -0.179121F, 0.325641F
+    };
+
+    for (int i = 0; i < QK; i++) {
+        SET_ELEMENT_F32(vec, i, vec_values[i]);
+    }
+
+    struct ggml_tensor * expected_result = ggml_mul_mat(ctx, mat, vec);
+    struct ggml_tensor * quantized_result = ggml_mul_mat(ctx, quantized_mat, vec);
+
+    struct ggml_cgraph graph = ggml_build_forward(expected_result);
+    ggml_build_forward_expand(&graph, quantized_result);
+    graph.n_threads = 2;
+    ggml_graph_compute(ctx, &graph);
+
+    float diff_sum = 0.0F;
+
+    for (int i = 0; i < 4; i++) {
+        fprintf(
+            stderr,
+            "[%d] expected %f, actual %f\n",
+            i,
+            GET_ELEMENT_F32(expected_result, i),
+            GET_ELEMENT_F32(quantized_result, i)
+        );
+
+        diff_sum += fabsf(GET_ELEMENT_F32(expected_result, i) - GET_ELEMENT_F32(quantized_result, i));
+    }
+
+    float diff_average = diff_sum / 4;
+
+    // If Q4_1_O format works correctly, difference should be this or lower
+    ASSERT(diff_average <= 0.112F, "Unexpected average difference value %f", diff_average);
+
+    ggml_free(ctx);
+
+    return 0;
+}
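Editor's note: the assertions above pin down the Q4_1_O layout: the 4-bit codes in qs reconstruct as m + d * q, while one outlier per 32-value block is stored exactly in FP16 and overrides its nibble on decode. A sketch of that decode rule, as an assumption inferred from the test (it reuses block_q4_1_o and QK as defined above; the real implementation lives in ggml.c):

    // Decodes one block_q4_1_o into 32 floats.
    static void dequantize_block_sketch(const block_q4_1_o * b, float * out) {
        const float d = ggml_fp16_to_fp32(b->d); // Shared scale.
        const float m = ggml_fp16_to_fp32(b->m); // Shared minimum.

        for (int i = 0; i < QK; i++) {
            // Even indices live in the low nibble, odd indices in the high nibble.
            const uint8_t q = (i % 2) ? (b->qs[i / 2] >> 4) : (b->qs[i / 2] & 0xF);
            out[i] = m + d * (float) q;
        }

        // The outlier bypasses 4-bit quantization entirely.
        out[b->outlier_index] = ggml_fp16_to_fp32(b->outlier_value);
    }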
diff --git a/tests/test_Q4_1_O_large_matmul.c b/tests/test_Q4_1_O_large_matmul.c
new file mode 100644
index 0000000..0bd4e27
--- /dev/null
+++ b/tests/test_Q4_1_O_large_matmul.c
@@ -0,0 +1,86 @@
+// Tests that Q4_1_O matmul on a large matrix works (does not crash, etc.)
+
+#include "ggml.h"
+#include "rwkv.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define GET_ELEMENT_F32(tensor, i) (((float *) tensor->data)[i])
+
+#define SET_ELEMENT_F32(tensor, i, value) ((float *) tensor->data)[i] = value
+
+#define ASSERT(x, ...) {\
+    if (!(x)) {\
+        fprintf(stderr, "*** Assertion failed ***\n");\
+        fprintf(stderr, __VA_ARGS__);\
+        fprintf(stderr, "\n%s:%d\n", __FILE__, __LINE__);\
+        abort();\
+    }\
+}
+
+#define RANDOM_FLOAT() (((rand() & 0xFFF) / ((float) 0xFFF) - 0.5F) * 3.0F)
+
+// ---
+
+#define QK 32
+#define MATRIX_SIZE 1024
+
+int main(int argc, const char ** argv) {
+    srand(42);
+
+    struct ggml_init_params params = {
+        .mem_size = 8 * 1024 * 1024,
+        .mem_buffer = NULL,
+        .no_alloc = false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * mat = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, MATRIX_SIZE, MATRIX_SIZE);
+
+    for (int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; i++) {
+        SET_ELEMENT_F32(mat, i, RANDOM_FLOAT());
+    }
+
+    // Add some outliers
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        SET_ELEMENT_F32(mat, i * MATRIX_SIZE + 1, RANDOM_FLOAT() * 100.0F);
+    }
+
+    struct ggml_tensor * quantized_mat = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_1_O, MATRIX_SIZE, MATRIX_SIZE);
+
+    int64_t histogram[16];
+
+    ggml_quantize_q4_1_o(mat->data, quantized_mat->data, MATRIX_SIZE * MATRIX_SIZE, QK, histogram);
+
+    struct ggml_tensor * vec = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, MATRIX_SIZE);
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        SET_ELEMENT_F32(vec, i, RANDOM_FLOAT());
+    }
+
+    struct ggml_tensor * expected_result = ggml_mul_mat(ctx, mat, vec);
+    struct ggml_tensor * quantized_result = ggml_mul_mat(ctx, quantized_mat, vec);
+
+    struct ggml_cgraph graph = ggml_build_forward(expected_result);
+    ggml_build_forward_expand(&graph, quantized_result);
+    graph.n_threads = 4;
+    ggml_graph_compute(ctx, &graph);
+
+    float diff_sum = 0.0F;
+
+    for (int i = 0; i < MATRIX_SIZE; i++) {
+        diff_sum += fabsf(GET_ELEMENT_F32(expected_result, i) - GET_ELEMENT_F32(quantized_result, i));
+    }
+
+    float diff_average = diff_sum / MATRIX_SIZE;
+
+    // More strict test is in test_Q4_1_O.c, here we just do sanity check
+    ASSERT(diff_average <= 2.0F, "Unexpected average difference value %f", diff_average);
+
+    ggml_free(ctx);
+
+    return 0;
+}
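Editor's note: two details in the test above are worth calling out. The planted outliers all land at column 1, i.e. in the first 32-element block of every row, and the pass threshold (average absolute difference no more than 2.0 over 1024-element dot products) is deliberately loose; the tight accuracy check lives in test_Q4_1_O.c.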
diff --git a/tests/test_ggml_basics.c b/tests/test_ggml_basics.c
new file mode 100644
index 0000000..d14f85a
--- /dev/null
+++ b/tests/test_ggml_basics.c
@@ -0,0 +1,62 @@
+// Tests that ggml basics work.
+
+#include "ggml.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#define SET_ELEMENT_F32(tensor, i, value) ((float *) tensor->data)[i] = value
+
+#define ASSERT(x, ...) {\
+    if (!(x)) {\
+        fprintf(stderr, "*** Assertion failed ***\n");\
+        fprintf(stderr, __VA_ARGS__);\
+        fprintf(stderr, "\n%s:%d\n", __FILE__, __LINE__);\
+        abort();\
+    }\
+}
+
+#define ASSERT_ELEMENT_F32(tensor, i, expected_value) {\
+    float actual = ((float *) tensor->data)[i];\
+    ASSERT(fabsf(actual - expected_value) <= 0.0000001F, "At %s[%d]: expected %f, actual %f", #tensor, i, expected_value, actual);\
+}
+
+int main(int argc, const char ** argv) {
+    struct ggml_init_params params = {
+        .mem_size = 16 * 1024,
+        .mem_buffer = NULL,
+        .no_alloc = false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+    SET_ELEMENT_F32(x, 0, -10.0F);
+    SET_ELEMENT_F32(x, 1, 0.0F);
+    SET_ELEMENT_F32(x, 2, 2.5F);
+    SET_ELEMENT_F32(x, 3, 5.0F);
+
+    struct ggml_tensor * y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+    SET_ELEMENT_F32(y, 0, 1.0F);
+    SET_ELEMENT_F32(y, 1, 2.0F);
+    SET_ELEMENT_F32(y, 2, 3.0F);
+    SET_ELEMENT_F32(y, 3, 4.0F);
+
+    struct ggml_tensor * sum = ggml_add(ctx, x, y);
+
+    struct ggml_cgraph graph = ggml_build_forward(sum);
+    graph.n_threads = 2;
+    ggml_graph_compute(ctx, &graph);
+
+    ASSERT_ELEMENT_F32(sum, 0, -9.0F);
+    ASSERT_ELEMENT_F32(sum, 1, 2.0F);
+    ASSERT_ELEMENT_F32(sum, 2, 5.5F);
+    ASSERT_ELEMENT_F32(sum, 3, 9.0F);
+
+    ggml_print_objects(ctx);
+
+    ggml_free(ctx);
+
+    return 0;
+}
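Editor's note: test_ggml_basics.c also documents the computation pattern the other tests rely on in this ggml revision: calls like ggml_add only record nodes in the context; nothing is evaluated until ggml_build_forward collects the graph and ggml_graph_compute executes it, with the thread count set directly on the ggml_cgraph struct.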
diff --git a/tests/test_tiny_rwkv.c b/tests/test_tiny_rwkv.c
new file mode 100644
index 0000000..49c2873
--- /dev/null
+++ b/tests/test_tiny_rwkv.c
@@ -0,0 +1,107 @@
+// Tests that tiny RWKV outputs expected results in all data types.
+
+#include "ggml.h"
+#include "rwkv.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#define ASSERT(x, ...) {\
+    if (!(x)) {\
+        fprintf(stderr, "*** Assertion failed ***\n");\
+        fprintf(stderr, __VA_ARGS__);\
+        fprintf(stderr, "\n%s:%d\n", __FILE__, __LINE__);\
+        abort();\
+    }\
+}
+
+// ---
+
+#define N_VOCAB 256
+#define N_THREADS 2
+
+void test_model(const char * model_path, const float * expected_logits, const float max_diff) {
+    fprintf(stderr, "Testing %s\n", model_path);
+
+    struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS);
+
+    uint32_t n_vocab = rwkv_get_logits_buffer_element_count(model);
+
+    ASSERT(n_vocab == N_VOCAB, "Unexpected n_vocab in the model");
+
+    float * state = malloc(sizeof(float) * rwkv_get_state_buffer_element_count(model));
+    float * logits = malloc(sizeof(float) * n_vocab);
+
+    char * prompt = "\"in";
+
+    const size_t prompt_length = strlen(prompt);
+
+    for (size_t i = 0; i < prompt_length; i++) {
+        rwkv_eval(model, prompt[i], i == 0 ? NULL : state, state, logits);
+    }
+
+    float diff_sum = 0.0F;
+
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        diff_sum += logits[i] - expected_logits[i];
+    }
+
+    fprintf(stderr, "Difference sum: %f\n", diff_sum);
+
+    // When something breaks, difference would be way more than 10
+    ASSERT(fabsf(diff_sum) <= fabsf(max_diff) + 0.01F, "Too big difference %f, expected no more than %f", diff_sum, max_diff);
+
+    rwkv_free(model);
+
+    free(state);
+    free(logits);
+}
+
+int main(int argc, const char ** argv) {
+    fprintf(stderr, "System info: %s\n", rwkv_get_system_info_string());
+
+    float * expected_logits = malloc(sizeof(float) * N_VOCAB);
+    FILE * file = fopen("expected_logits.bin", "rb");
+    ASSERT(file != NULL, "Failed to open expected_logits.bin");
+    size_t elements_read = fread(expected_logits, sizeof(float), N_VOCAB, file);
+    ASSERT(elements_read == N_VOCAB, "Failed to read expected_logits.bin, read %zd elements", elements_read);
+    fclose(file);
+
+    float expected_difference_sum[8] = {
+        0.000000F,
+        -0.005320F,
+
+        -0.501214F,
+        -1.092427F,
+        -0.268956F,
+
+        -0.501073F,
+        -1.103214F,
+        -0.244590F
+    };
+
+    test_model("tiny-rwkv-660K-FP32.bin", expected_logits, expected_difference_sum[0]);
+    test_model("tiny-rwkv-660K-FP16.bin", expected_logits, expected_difference_sum[1]);
+
+    rwkv_quantize_model_file("tiny-rwkv-660K-FP32.bin", "tiny-rwkv-660K-FP32-Q4_0.bin", 2);
+    rwkv_quantize_model_file("tiny-rwkv-660K-FP32.bin", "tiny-rwkv-660K-FP32-Q4_1.bin", 3);
+    rwkv_quantize_model_file("tiny-rwkv-660K-FP32.bin", "tiny-rwkv-660K-FP32-Q4_1_O.bin", 4);
+
+    test_model("tiny-rwkv-660K-FP32-Q4_0.bin", expected_logits, expected_difference_sum[2]);
+    test_model("tiny-rwkv-660K-FP32-Q4_1.bin", expected_logits, expected_difference_sum[3]);
+    test_model("tiny-rwkv-660K-FP32-Q4_1_O.bin", expected_logits, expected_difference_sum[4]);
+
+    rwkv_quantize_model_file("tiny-rwkv-660K-FP16.bin", "tiny-rwkv-660K-FP16-Q4_0.bin", 2);
+    rwkv_quantize_model_file("tiny-rwkv-660K-FP16.bin", "tiny-rwkv-660K-FP16-Q4_1.bin", 3);
+    rwkv_quantize_model_file("tiny-rwkv-660K-FP16.bin", "tiny-rwkv-660K-FP16-Q4_1_O.bin", 4);
+
+    test_model("tiny-rwkv-660K-FP16-Q4_0.bin", expected_logits, expected_difference_sum[5]);
+    test_model("tiny-rwkv-660K-FP16-Q4_1.bin", expected_logits, expected_difference_sum[6]);
+    test_model("tiny-rwkv-660K-FP16-Q4_1_O.bin", expected_logits, expected_difference_sum[7]);
+
+    free(expected_logits);
+
+    return 0;
+}
diff --git a/tests/tiny-rwkv-660K-FP16.bin b/tests/tiny-rwkv-660K-FP16.bin
new file mode 100644
index 0000000..84c0499
Binary files /dev/null and b/tests/tiny-rwkv-660K-FP16.bin differ
diff --git a/tests/tiny-rwkv-660K-FP32.bin b/tests/tiny-rwkv-660K-FP32.bin
new file mode 100644
index 0000000..69c1e6f
Binary files /dev/null and b/tests/tiny-rwkv-660K-FP32.bin differ