Fix FP16 lookup table

This commit is contained in:
saharNooby 2023-04-17 16:39:43 +04:00
parent 678f5233a5
commit 2f37c6b019
2 changed files with 25 additions and 50 deletions

68
ggml.c
View File

@ -25,35 +25,6 @@
#define static_assert(cond, msg) struct global_scope_noop_trick #define static_assert(cond, msg) struct global_scope_noop_trick
#endif #endif
// https://gist.github.com/rygorous/2144712
// Public domain, by Fabian "ryg" Giesen
inline static float ggml_half_to_float_reference(uint16_t value) {
union FP32 {
uint32_t u;
float f;
};
const union FP32 magic = { (254UL - 15UL) << 23 };
const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
union FP32 out;
// Exponent/mantissa bits
out.u = (value & 0x7FFFU) << 13;
// Exponent adjust
out.f *= magic.f;
// Make sure Inf/NaN survive
if (out.f >= was_inf_nan.f) {
out.u |= 255UL << 23;
}
// Sign bit
out.u |= (value & 0x8000UL) << 16;
return out.f;
}
#if defined _MSC_VER || defined(__MINGW32__) #if defined _MSC_VER || defined(__MINGW32__)
#if !defined(__MINGW32__) #if !defined(__MINGW32__)
@ -355,13 +326,10 @@ static float table_f32_f16[1 << 16];
// This is also true for POWER9. // This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
// For some reason, lookup table does not work on my machine. uint16_t s;
// Replaced lookup with working reference code. memcpy(&s, &f, sizeof(uint16_t));
// TODO This must be properly debugged and fixed return table_f32_f16[s];
return ggml_half_to_float_reference(f);
} }
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
@ -1194,8 +1162,8 @@ static inline void quantize_row_q4_1_o_reference_single_block(const float * rest
} }
static inline void dequantize_row_q4_1_o_reference_single_block(block_q4_1_o * restrict block, float * restrict y) { static inline void dequantize_row_q4_1_o_reference_single_block(block_q4_1_o * restrict block, float * restrict y) {
const float d = ggml_half_to_float_reference(block->d); const float d = GGML_FP16_TO_FP32(block->d);
const float m = ggml_half_to_float_reference(block->m); const float m = GGML_FP16_TO_FP32(block->m);
const uint8_t * restrict pp = block->qs; const uint8_t * restrict pp = block->qs;
@ -1216,7 +1184,7 @@ static inline void dequantize_row_q4_1_o_reference_single_block(block_q4_1_o * r
} }
// Restore the outlier // Restore the outlier
y[block->outlier_index] = ggml_half_to_float_reference(block->outlier_value); y[block->outlier_index] = GGML_FP16_TO_FP32(block->outlier_value);
} }
static void quantize_row_q4_1_o_reference(const float * restrict x, void * restrict vy, int k) { static void quantize_row_q4_1_o_reference(const float * restrict x, void * restrict vy, int k) {
@ -1242,8 +1210,8 @@ static void dequantize_row_q4_1_o(const void * restrict vx, float * restrict y,
#if defined(__AVX2__) #if defined(__AVX2__)
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
const float x_d = ggml_half_to_float_reference(x[i].d); const float x_d = GGML_FP16_TO_FP32(x[i].d);
const float x_m = ggml_half_to_float_reference(x[i].m); const float x_m = GGML_FP16_TO_FP32(x[i].m);
const __m256 d_v = _mm256_broadcast_ss(&x_d); const __m256 d_v = _mm256_broadcast_ss(&x_d);
const __m256 d_m = _mm256_broadcast_ss(&x_m); const __m256 d_m = _mm256_broadcast_ss(&x_m);
@ -1274,12 +1242,12 @@ static void dequantize_row_q4_1_o(const void * restrict vx, float * restrict y,
} }
// Restore the outlier // Restore the outlier
y[i * QK + x[i].outlier_index] = ggml_half_to_float_reference(x[i].outlier_value); y[i * QK + x[i].outlier_index] = GGML_FP16_TO_FP32(x[i].outlier_value);
} }
#elif defined(__ARM_NEON) #elif defined(__ARM_NEON)
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
const float x_d = ggml_half_to_float_reference(x[i].d); const float x_d = GGML_FP16_TO_FP32(x[i].d);
const float x_m = ggml_half_to_float_reference(x[i].m); const float x_m = GGML_FP16_TO_FP32(x[i].m);
const float32x4_t vd = vdupq_n_f32(x_d); const float32x4_t vd = vdupq_n_f32(x_d);
const float32x4_t vm = vdupq_n_f32(x_m); const float32x4_t vm = vdupq_n_f32(x_m);
@ -1324,7 +1292,7 @@ static void dequantize_row_q4_1_o(const void * restrict vx, float * restrict y,
} }
// Restore the outlier // Restore the outlier
y[i * QK + x[i].outlier_index] = ggml_half_to_float_reference(x[i].outlier_value); y[i * QK + x[i].outlier_index] = GGML_FP16_TO_FP32(x[i].outlier_value);
} }
#else #else
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
@ -7292,12 +7260,12 @@ static void ggml_compute_forward_mul_mat_q4_1_o_f32(
// Here we do fused dequantization and dot product. // Here we do fused dequantization and dot product.
for (int block_index = 0; block_index < block_count; block_index++) { for (int block_index = 0; block_index < block_count; block_index++) {
const float block_d = ggml_half_to_float_reference(row_blocks[block_index].d); const float block_d = GGML_FP16_TO_FP32(row_blocks[block_index].d);
const float block_m = ggml_half_to_float_reference(row_blocks[block_index].m); const float block_m = GGML_FP16_TO_FP32(row_blocks[block_index].m);
// 0 .. 31 // 0 .. 31
const uint16_t outlier_index = row_blocks[block_index].outlier_index; const uint16_t outlier_index = row_blocks[block_index].outlier_index;
const float outlier_value = ggml_half_to_float_reference(row_blocks[block_index].outlier_value); const float outlier_value = GGML_FP16_TO_FP32(row_blocks[block_index].outlier_value);
const uint8_t * restrict quant_nibbles = row_blocks[block_index].qs; const uint8_t * restrict quant_nibbles = row_blocks[block_index].qs;
@ -11473,11 +11441,11 @@ void ggml_test_quantization_q4_1_o(void) {
size_t size = ggml_quantize_q4_1_o(src, dst, QK, QK, hist); size_t size = ggml_quantize_q4_1_o(src, dst, QK, QK, hist);
GGML_TEST_ASSERT(size == 24, "%zd", size); GGML_TEST_ASSERT(size == 24, "%zd", size);
float delta_result = ggml_half_to_float_reference(((block_q4_1_o *) dst)->d); float delta_result = GGML_FP16_TO_FP32(((block_q4_1_o *) dst)->d);
float delta_expected = (src[30] - src[0]) / ((1 << 4) - 1); float delta_expected = (src[30] - src[0]) / ((1 << 4) - 1);
GGML_TEST_ASSERT(delta_result == delta_expected, "%f, %f", delta_result, delta_expected); GGML_TEST_ASSERT(delta_result == delta_expected, "%f, %f", delta_result, delta_expected);
float min_result = ggml_half_to_float_reference(((block_q4_1_o *) dst)->m); float min_result = GGML_FP16_TO_FP32(((block_q4_1_o *) dst)->m);
float min_expected = src[0]; float min_expected = src[0];
GGML_TEST_ASSERT(min_result == min_expected, "%f, %f", min_result, min_expected); GGML_TEST_ASSERT(min_result == min_expected, "%f, %f", min_result, min_expected);
@ -11485,7 +11453,7 @@ void ggml_test_quantization_q4_1_o(void) {
uint16_t outlier_index_expected = 31; uint16_t outlier_index_expected = 31;
GGML_TEST_ASSERT(outlier_index == outlier_index_expected, "%d, %d", outlier_index, outlier_index_expected); GGML_TEST_ASSERT(outlier_index == outlier_index_expected, "%d, %d", outlier_index, outlier_index_expected);
float outlier_value_result = ggml_half_to_float_reference(((block_q4_1_o *) dst)->outlier_value); float outlier_value_result = GGML_FP16_TO_FP32(((block_q4_1_o *) dst)->outlier_value);
float outlier_value_expected = src[31]; float outlier_value_expected = src[31];
GGML_TEST_ASSERT(outlier_value_result == outlier_value_expected, "%f, %f", outlier_value_result, outlier_value_expected); GGML_TEST_ASSERT(outlier_value_result == outlier_value_expected, "%f, %f", outlier_value_result, outlier_value_expected);

View File

@ -557,6 +557,13 @@ void rwkv_free(struct rwkv_context * ctx) {
bool rwkv_quantize_model_file(const char * model_file_path_in, const char * model_file_path_out, uint32_t q_type) { bool rwkv_quantize_model_file(const char * model_file_path_in, const char * model_file_path_out, uint32_t q_type) {
RWKV_ASSERT_FALSE(q_type == 2 || q_type == 3 || q_type == 4, "Unsupported quantization type %d", q_type); RWKV_ASSERT_FALSE(q_type == 2 || q_type == 3 || q_type == 4, "Unsupported quantization type %d", q_type);
// Needed to initialize FP16 lookup table
{
struct ggml_init_params params = { 0, NULL };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
ggml_type type = FORMAT_TYPE_TO_GGML_TYPE[q_type]; ggml_type type = FORMAT_TYPE_TO_GGML_TYPE[q_type];
printf("Loading model from '%s'\n", model_file_path_in); printf("Loading model from '%s'\n", model_file_path_in);