From 38f9d02d522c4d79733e2a92167656b6260b2e54 Mon Sep 17 00:00:00 2001
From: saharNooby
Date: Sat, 1 Apr 2023 20:01:06 +0400
Subject: [PATCH] Fix quantization from FP16

---
 README.md |  6 +++---
 ggml.c    | 38 +++++++++++++++++++++++++++++++++++---
 rwkv.cpp  |  1 +
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b0ea4ea..6cc84b4 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,12 @@
 
 This is a port of [BlinkDL/RWKV-LM](https://github.com/BlinkDL/RWKV-LM) to [ggerganov/ggml](https://github.com/ggerganov/ggml). The end goal is to allow 4-bit quantized inference on CPU.
 
-**WORK IN PROGRESS!** **Status**: There is a Python wrapper, FP32 and FP16 inference work correctly. Currently, I'm working on INT4 quantization support.
+**WORK IN PROGRESS!** **Status**: FP32, FP16 and INT4 inference work. INT4 quality is not great yet; perplexity still needs to be properly measured and compared.
 
 ## Plan
 
-1. Make INT4 inference work
-2. Create Python script with sampling and simple chat interface
+1. Create Python script with sampling and simple chat interface
+2. Measure performance and quality of different model sizes and data types
 3. Clean up the repo (remove llama related files and mentions)
 4. Write a good `README.md` and publish links to this repo
 5. Create pull request to main `ggml` repo with all improvements made here
diff --git a/ggml.c b/ggml.c
index a7d932f..38758fb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -326,10 +326,42 @@ static float table_f32_f16[1 << 16];
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
 
+// https://gist.github.com/rygorous/2144712
+// Public domain, by Fabian "ryg" Giesen
+inline static float ggml_half_to_float(uint16_t value) {
+    union FP32 {
+        uint32_t u;
+        float f;
+    };
+
+    const union FP32 magic = { (254UL - 15UL) << 23 };
+    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
+
+    union FP32 out;
+
+    // Exponent/mantissa bits
+    out.u = (value & 0x7FFFU) << 13;
+    // Exponent adjust
+    out.f *= magic.f;
+
+    // Make sure Inf/NaN survive
+    if (out.f >= was_inf_nan.f) {
+        out.u |= 255UL << 23;
+    }
+
+    // Sign bit
+    out.u |= (value & 0x8000UL) << 16;
+
+    return out.f;
+}
+
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
+    // For some reason, the lookup table does not work on my machine:
+    // - Windows SDK version 10.0.19041.0
+    // - CMAKE_SYSTEM_PROCESSOR: AMD64
+    // Replaced the lookup with conversion code found online.
+    // TODO This must be properly debugged and fixed
+    return ggml_half_to_float(f);
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
diff --git a/rwkv.cpp b/rwkv.cpp
index 7110057..8f36f88 100644
--- a/rwkv.cpp
+++ b/rwkv.cpp
@@ -706,6 +706,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
         work.resize(nelements); // for quantization
 
         size_t cur_size = 0;
+        // This is a histogram of the quantized values. If it shows a single 1.0 followed by all 0.0, something went very wrong!
         std::vector<int64_t> hist_cur(1 << 4, 0);
 
         switch (type) {
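
The fallback conversion can be sanity-checked in isolation with the standalone C program below. This is only a minimal sketch: half_to_float and fp16_check.c are names made up for this test, the function body mirrors the ggml_half_to_float added by this patch, and the inputs 0x3C00, 0xC000 and 0x7C00 are the standard IEEE 754 half-precision encodings of 1.0, -2.0 and +Inf.

// fp16_check.c - standalone sanity check for the FP16 -> FP32 fallback conversion.
#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Local copy of the conversion added to ggml.c in this patch.
static float half_to_float(uint16_t value) {
    union FP32 {
        uint32_t u;
        float f;
    };

    const union FP32 magic = { (254UL - 15UL) << 23 };
    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };

    union FP32 out;

    // Exponent/mantissa bits
    out.u = (value & 0x7FFFU) << 13;
    // Exponent adjust
    out.f *= magic.f;

    // Make sure Inf/NaN survive
    if (out.f >= was_inf_nan.f) {
        out.u |= 255UL << 23;
    }

    // Sign bit
    out.u |= (value & 0x8000UL) << 16;

    return out.f;
}

int main(void) {
    // 0x3C00 is 1.0, 0xC000 is -2.0, 0x7C00 is +Inf in IEEE 754 half precision.
    printf("%f\n", half_to_float(0x3C00));        // expect 1.000000
    printf("%f\n", half_to_float(0xC000));        // expect -2.000000
    printf("%d\n", isinf(half_to_float(0x7C00))); // expect non-zero
    return 0;
}

Compiling with `cc -o fp16_check fp16_check.c -lm` and running it should print 1.000000, -2.000000 and a non-zero value if the conversion behaves as expected.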