Fix quantization from FP16
parent 972e28d48d
commit 38f9d02d52

README.md

@@ -2,12 +2,12 @@
 This is a port of [BlinkDL/RWKV-LM](https://github.com/BlinkDL/RWKV-LM) to [ggerganov/ggml](https://github.com/ggerganov/ggml). The end goal is to allow 4-bit quantized inference on CPU.
 
-**WORK IN PROGRESS!** **Status**: There is a Python wrapper; FP32 and FP16 inference work correctly. Currently, I'm working on INT4 quantization support.
+**WORK IN PROGRESS!** **Status**: FP32, FP16 and INT4 inference work. INT4 quality is not good yet; perplexity needs to be properly measured and compared.
 
 ## Plan
 
-1. Make INT4 inference work
-2. Create Python script with sampling and simple chat interface
+1. Create Python script with sampling and simple chat interface
 2. Measure performance and quality of different model sizes and data types
 3. Clean up the repo (remove llama related files and mentions)
 4. Write a good `README.md` and publish links to this repo
 5. Create pull request to main `ggml` repo with all improvements made here

38  ggml.c
@@ -326,10 +326,42 @@ static float table_f32_f16[1 << 16];
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
 
+// https://gist.github.com/rygorous/2144712
+// Public domain, by Fabian "ryg" Giesen
+inline static float ggml_half_to_float(uint16_t value) {
+    union FP32 {
+        uint32_t u;
+        float f;
+    };
+
+    const union FP32 magic = { (254UL - 15UL) << 23 };
+    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
+
+    union FP32 out;
+
+    // Exponent/mantissa bits
+    out.u = (value & 0x7FFFU) << 13;
+    // Exponent adjust
+    out.f *= magic.f;
+
+    // Make sure Inf/NaN survive
+    if (out.f >= was_inf_nan.f) {
+        out.u |= 255UL << 23;
+    }
+
+    // Sign bit
+    out.u |= (value & 0x8000UL) << 16;
+
+    return out.f;
+}
+
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     uint16_t s;
     memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
+    // For some reason, the lookup table does not work on my machine:
+    // - Windows SDK version 10.0.19041.0
+    // - CMAKE_SYSTEM_PROCESSOR: AMD64
+    // Replaced the lookup with conversion code found online.
+    // TODO This must be properly debugged and fixed
+    return ggml_half_to_float(f);
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
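
Editor's note on the conversion above: it is Fabian Giesen's magic-constant trick. The FP16 exponent and mantissa bits are shifted into FP32 position, a multiply by the magic value 2^112 rebases the exponent from bias 15 to bias 127 (and correctly renormalizes FP16 subnormals), Inf/NaN get their exponent field forced back to all-ones, and the sign bit is OR'd in last. Below is a minimal, self-contained sanity check, illustrative only and not part of the commit: it copies ggml_half_to_float, fills the 65,536-entry table the way an init function would, and spot-checks a few well-known FP16 bit patterns. If these direct conversions come out right while the table path still misbehaves, the bug is in table initialization or initialization order rather than in the math, which would help narrow down the Windows issue mentioned in the comment.

#include <stdint.h>
#include <stdio.h>

// Copy of ggml_half_to_float from the diff above.
static float ggml_half_to_float(uint16_t value) {
    union FP32 {
        uint32_t u;
        float f;
    };

    const union FP32 magic = { (254UL - 15UL) << 23 };       // 2^112 as a float
    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 }; // 65536.0f

    union FP32 out;
    out.u = (value & 0x7FFFU) << 13; // exponent/mantissa bits
    out.f *= magic.f;                // rebase exponent: bias 15 -> bias 127
    if (out.f >= was_inf_nan.f) {    // max finite FP16 is 65504, so this was Inf/NaN
        out.u |= 255UL << 23;
    }
    out.u |= (value & 0x8000UL) << 16; // sign bit
    return out.f;
}

static float table_f32_f16[1 << 16];

int main(void) {
    // Fill the table once, the way an init function would.
    for (uint32_t i = 0; i < (1u << 16); i++) {
        table_f32_f16[i] = ggml_half_to_float((uint16_t) i);
    }

    // Spot-check well-known FP16 bit patterns.
    printf("0x3C00 -> %g\n", table_f32_f16[0x3C00]); // expect 1
    printf("0xC000 -> %g\n", table_f32_f16[0xC000]); // expect -2
    printf("0x7C00 -> %g\n", table_f32_f16[0x7C00]); // expect inf
    printf("0x0001 -> %g\n", table_f32_f16[0x0001]); // expect ~5.96046e-08 (smallest subnormal)
    return 0;
}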

1  rwkv.cpp
@@ -706,6 +706,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
     work.resize(nelements); // for quantization
 
     size_t cur_size = 0;
+    // This is a histogram of the quantized values. If it shows a single 1.0 followed by all 0.0, something went very wrong!
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     switch (type) {
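
Editor's note, an illustrative sketch that is not part of the commit: hist_cur has one bucket per possible 4-bit code, so after dividing by the element count it shows what fraction of the weights landed on each quantized value. A healthy quantization spreads mass across many buckets; a single 1.0 followed by all 0.0 means every weight collapsed into the same code, which is exactly the failure mode this commit's FP16 fix addresses. The data below is made up; only the accumulate-and-normalize pattern matters.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Hypothetical 4-bit codes produced by quantizing one tensor.
    const uint8_t quantized[] = { 7, 8, 6, 9, 7, 8, 5, 10, 7, 8, 8, 6 };
    const size_t n = sizeof(quantized) / sizeof(quantized[0]);

    // One bucket per possible 4-bit value, like hist_cur(1 << 4, 0).
    int64_t hist[1 << 4] = { 0 };
    for (size_t i = 0; i < n; i++) {
        hist[quantized[i]]++;
    }

    // Print each bucket as a fraction of all values; this is the
    // "single 1.0, then all 0.0" check described in the comment above.
    for (int b = 0; b < (1 << 4); b++) {
        printf("%.3f ", (double) hist[b] / (double) n);
    }
    printf("\n");
    return 0;
}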