Fix quantization from FP16

saharNooby 2023-04-01 20:01:06 +04:00
parent 972e28d48d
commit 38f9d02d52
3 changed files with 39 additions and 6 deletions

README.md

@@ -2,12 +2,12 @@
 This is a port of [BlinkDL/RWKV-LM](https://github.com/BlinkDL/RWKV-LM) to [ggerganov/ggml](https://github.com/ggerganov/ggml). The end goal is to allow 4-bit quantized inference on CPU.
-**WORK IN PROGRESS!** **Status**: There is a Python wrapper; FP32 and FP16 inference work correctly. Currently, I'm working on INT4 quantization support.
+**WORK IN PROGRESS!** **Status**: FP32, FP16 and INT4 inference work. INT4 quality is not great yet; perplexity needs to be properly measured and compared.
 ## Plan
-1. Make INT4 inference work
-2. Create Python script with sampling and simple chat interface
+1. Create Python script with sampling and simple chat interface
+2. Measure performance and quality of different model sizes and data types
 3. Clean up the repo (remove llama related files and mentions)
 4. Write a good `README.md` and publish links to this repo
 5. Create pull request to main `ggml` repo with all improvements made here
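The new status line mentions measuring and comparing perplexity across data types. As a rough reference only (this is not code from the repo; the function name and toy probabilities are purely illustrative), perplexity over a test text is the exponential of the average negative log-likelihood the model assigns to each correct next token, so running the same text through FP16 and INT4 models quantifies the quality lost to quantization:

```c
// Illustrative sketch of a perplexity calculation. Build with: cc perplexity_sketch.c -lm
#include <math.h>
#include <stdio.h>

// token_probs[i] is the probability the model assigned to the actual next token at step i.
static double perplexity(const double * token_probs, int n) {
    double nll = 0.0;
    for (int i = 0; i < n; i++) {
        nll += -log(token_probs[i]);
    }
    return exp(nll / n); // lower is better
}

int main(void) {
    // Toy example with made-up probabilities.
    double probs[] = { 0.5, 0.25, 0.125, 0.5 };
    printf("perplexity: %f\n", perplexity(probs, 4)); // ~3.364
    return 0;
}
```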

ggml.c

@@ -326,10 +326,42 @@ static float table_f32_f16[1 << 16];
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+// https://gist.github.com/rygorous/2144712
+// Public domain, by Fabian "ryg" Giesen
+inline static float ggml_half_to_float(uint16_t value) {
+    union FP32 {
+        uint32_t u;
+        float f;
+    };
+
+    const union FP32 magic = { (254UL - 15UL) << 23 };
+    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
+
+    union FP32 out;
+    // Exponent/mantissa bits
+    out.u = (value & 0x7FFFU) << 13;
+    // Exponent adjust
+    out.f *= magic.f;
+    // Make sure Inf/NaN survive
+    if (out.f >= was_inf_nan.f) {
+        out.u |= 255UL << 23;
+    }
+    // Sign bit
+    out.u |= (value & 0x8000UL) << 16;
+    return out.f;
+}
+
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
+    // For some reason, lookup table does not work on my machine:
+    // - Windows SDK version 10.0.19041.0
+    // - CMAKE_SYSTEM_PROCESSOR: AMD64
+    // Replaced lookup with some conversion code found online.
+    // TODO This must be properly debugged and fixed
+    return ggml_half_to_float(f);
 }

 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
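The conversion added above is the standard bit-twiddling FP16-to-FP32 expansion: shift the exponent/mantissa bits into FP32 position, rescale by a "magic" power of two to re-bias the exponent, patch up Inf/NaN, and re-attach the sign. Below is a small standalone sanity check, not part of the commit (the helper name `half_to_float` and the test values are illustrative), that runs the same logic against a few FP16 bit patterns with known values:

```c
// Standalone sanity check for the FP16 -> FP32 conversion used in the patch above.
// Build and run with, e.g.:  cc fp16_check.c -o fp16_check && ./fp16_check
#include <stdint.h>
#include <stdio.h>

// Same conversion as in the patch, copied here so this file is self-contained.
static float half_to_float(uint16_t value) {
    union FP32 { uint32_t u; float f; };
    const union FP32 magic       = { (254UL - 15UL) << 23 };
    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
    union FP32 out;
    out.u = (value & 0x7FFFU) << 13;    // exponent/mantissa bits
    out.f *= magic.f;                   // exponent adjust
    if (out.f >= was_inf_nan.f) {       // make sure Inf/NaN survive
        out.u |= 255UL << 23;
    }
    out.u |= (value & 0x8000UL) << 16;  // sign bit
    return out.f;
}

int main(void) {
    // A few FP16 bit patterns with known values.
    struct { uint16_t bits; float expected; } cases[] = {
        { 0x0000U,     0.0f },  // +0
        { 0x3C00U,     1.0f },  // 1.0
        { 0xC000U,    -2.0f },  // -2.0
        { 0x7BFFU, 65504.0f },  // largest finite FP16
    };

    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        float got = half_to_float(cases[i].bits);
        printf("0x%04X -> %g (expected %g): %s\n",
               (unsigned) cases[i].bits, got, cases[i].expected,
               got == cases[i].expected ? "OK" : "MISMATCH");
    }
    return 0;
}
```

Every case should print OK; if the lookup-table path is ever fixed, the same table of inputs can be used to compare both implementations.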

rwkv.cpp

@@ -706,6 +706,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
             work.resize(nelements); // for quantization
             size_t cur_size = 0;
+            // This is a histogram of the quantized values. If it shows a single 1.0 followed by all 0.0, something went very wrong!
             std::vector<int64_t> hist_cur(1 << 4, 0);
             switch (type) {
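The `hist_cur` vector added above has 16 buckets, one per possible 4-bit quantized value, and the ggml quantization helpers fill it with how often each value was produced. A quick way to act on the comment is to normalize the buckets and verify that more than one of them is actually used. The sketch below is illustrative only (the function name and the toy data are not from the repo):

```c
// Sketch of the sanity check described in the comment above: if all quantized
// values collapse into a single bucket, quantization almost certainly went wrong.
#include <stdint.h>
#include <stdio.h>

static int histogram_looks_sane(const int64_t hist[16]) {
    int64_t total = 0;
    for (int i = 0; i < 16; i++) {
        total += hist[i];
    }
    if (total == 0) {
        return 0; // nothing was quantized at all
    }

    int used_buckets = 0;
    for (int i = 0; i < 16; i++) {
        double fraction = (double) hist[i] / (double) total;
        printf("bucket %2d: %.4f\n", i, fraction);
        if (hist[i] > 0) {
            used_buckets++;
        }
    }
    // A single 1.0 followed by all 0.0 in the printout means only one bucket was used.
    return used_buckets > 1;
}

int main(void) {
    // Example: a degenerate histogram where everything landed in bucket 0.
    int64_t bad[16] = { 1000 };
    printf("sane: %d\n", histogram_looks_sane(bad)); // prints fractions, then "sane: 0"
    return 0;
}
```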