From 38f9d02d522c4d79733e2a92167656b6260b2e54 Mon Sep 17 00:00:00 2001
From: saharNooby
Date: Sat, 1 Apr 2023 20:01:06 +0400
Subject: [PATCH] Fix quantization from FP16

---
 README.md |  6 +++---
 ggml.c    | 38 +++++++++++++++++++++++++++++++++++---
 rwkv.cpp  |  1 +
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b0ea4ea..6cc84b4 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,12 @@
 
 This is a port of [BlinkDL/RWKV-LM](https://github.com/BlinkDL/RWKV-LM) to [ggerganov/ggml](https://github.com/ggerganov/ggml). The end goal is to allow 4-bit quantized inference on CPU.
 
-**WORK IN PROGRESS!** **Status**: There is a Python wrapper, FP32 and FP16 inference work correctly. Currently, I'm working on INT4 quantization support.
+**WORK IN PROGRESS!** **Status**: FP32, FP16 and INT4 inference work. INT4 quality is not great yet; perplexity still needs to be properly measured and compared.
 
 ## Plan
 
-1. Make INT4 inference work
-2. Create Python script with sampling and simple chat interface
+1. Create Python script with sampling and simple chat interface
+2. Measure performance and quality of different model sizes and data types
 3. Clean up the repo (remove llama related files and mentions)
 4. Write a good `README.md` and publish links to this repo
 5. Create pull request to main `ggml` repo with all improvements made here
diff --git a/ggml.c b/ggml.c
index a7d932f..38758fb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -326,10 +326,42 @@ static float table_f32_f16[1 << 16];
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
 
+// https://gist.github.com/rygorous/2144712
+// Public domain, by Fabian "ryg" Giesen
+inline static float ggml_half_to_float(uint16_t value) {
+    union FP32 {
+        uint32_t u;
+        float f;
+    };
+
+    const union FP32 magic = { (254UL - 15UL) << 23 };
+    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
+
+    union FP32 out;
+
+    // Exponent/mantissa bits
+    out.u = (value & 0x7FFFU) << 13;
+    // Exponent adjust
+    out.f *= magic.f;
+
+    // Make sure Inf/NaN survive
+    if (out.f >= was_inf_nan.f) {
+        out.u |= 255UL << 23;
+    }
+
+    // Sign bit
+    out.u |= (value & 0x8000UL) << 16;
+
+    return out.f;
+}
+
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
+    // For some reason, the lookup table does not work on my machine:
+    // - Windows SDK version 10.0.19041.0
+    // - CMAKE_SYSTEM_PROCESSOR: AMD64
+    // Replaced the lookup with conversion code found online.
+    // TODO This must be properly debugged and fixed
+    return ggml_half_to_float(f);
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
diff --git a/rwkv.cpp b/rwkv.cpp
index 7110057..8f36f88 100644
--- a/rwkv.cpp
+++ b/rwkv.cpp
@@ -706,6 +706,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
         work.resize(nelements); // for quantization
 
         size_t cur_size = 0;
+        // This is a histogram of the quantized values. If it shows a single 1.0 followed by all 0.0, something went very wrong!
         std::vector<int64_t> hist_cur(1 << 4, 0);
 
         switch (type) {
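
The fallback conversion can be sanity-checked in isolation with the standalone C program below. This is only a minimal sketch: half_to_float and fp16_check.c are names made up for this test, the function body mirrors the ggml_half_to_float added by this patch, and the inputs 0x3C00, 0xC000 and 0x7C00 are the standard IEEE 754 half-precision encodings of 1.0, -2.0 and +Inf.

// fp16_check.c - standalone sanity check for the FP16 -> FP32 fallback conversion.
#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Local copy of the conversion added to ggml.c in this patch.
static float half_to_float(uint16_t value) {
    union FP32 {
        uint32_t u;
        float f;
    };

    const union FP32 magic = { (254UL - 15UL) << 23 };
    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };

    union FP32 out;

    // Exponent/mantissa bits
    out.u = (value & 0x7FFFU) << 13;
    // Exponent adjust
    out.f *= magic.f;

    // Make sure Inf/NaN survive
    if (out.f >= was_inf_nan.f) {
        out.u |= 255UL << 23;
    }

    // Sign bit
    out.u |= (value & 0x8000UL) << 16;

    return out.f;
}

int main(void) {
    // 0x3C00 is 1.0, 0xC000 is -2.0, 0x7C00 is +Inf in IEEE 754 half precision.
    printf("%f\n", half_to_float(0x3C00));        // expect 1.000000
    printf("%f\n", half_to_float(0xC000));        // expect -2.000000
    printf("%d\n", isinf(half_to_float(0x7C00))); // expect non-zero
    return 0;
}

Compiling with `cc -o fp16_check fp16_check.c -lm` and running it should print 1.000000, -2.000000 and a non-zero value if the conversion behaves as expected.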