diff --git a/ggml.c b/ggml.c
index 6c946d0..70c4838 100644
--- a/ggml.c
+++ b/ggml.c
@@ -25,35 +25,6 @@
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 
-// https://gist.github.com/rygorous/2144712
-// Public domain, by Fabian "ryg" Giesen
-inline static float ggml_half_to_float_reference(uint16_t value) {
-    union FP32 {
-        uint32_t u;
-        float f;
-    };
-
-    const union FP32 magic = { (254UL - 15UL) << 23 };
-    const union FP32 was_inf_nan = { (127UL + 16UL) << 23 };
-
-    union FP32 out;
-
-    // Exponent/mantissa bits
-    out.u = (value & 0x7FFFU) << 13;
-    // Exponent adjust
-    out.f *= magic.f;
-
-    // Make sure Inf/NaN survive
-    if (out.f >= was_inf_nan.f) {
-        out.u |= 255UL << 23;
-    }
-
-    // Sign bit
-    out.u |= (value & 0x8000UL) << 16;
-
-    return out.f;
-}
-
 #if defined _MSC_VER || defined(__MINGW32__)
 
 #if !defined(__MINGW32__)
@@ -355,13 +326,10 @@ static float table_f32_f16[1 << 16];
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
 
-
-
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    // For some reason, lookup table does not work on my machine.
-    // Replaced lookup with working reference code.
-    // TODO This must be properly debugged and fixed
-    return ggml_half_to_float_reference(f);
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
@@ -1194,8 +1162,8 @@ static inline void quantize_row_q4_1_o_reference_single_block(const float * rest
 }
 
 static inline void dequantize_row_q4_1_o_reference_single_block(block_q4_1_o * restrict block, float * restrict y) {
-    const float d = ggml_half_to_float_reference(block->d);
-    const float m = ggml_half_to_float_reference(block->m);
+    const float d = GGML_FP16_TO_FP32(block->d);
+    const float m = GGML_FP16_TO_FP32(block->m);
 
     const uint8_t * restrict pp = block->qs;
 
@@ -1216,7 +1184,7 @@ static inline void dequantize_row_q4_1_o_reference_single_block(block_q4_1_o * r
     }
 
     // Restore the outlier
-    y[block->outlier_index] = ggml_half_to_float_reference(block->outlier_value);
+    y[block->outlier_index] = GGML_FP16_TO_FP32(block->outlier_value);
 }
 
 static void quantize_row_q4_1_o_reference(const float * restrict x, void * restrict vy, int k) {
@@ -1242,8 +1210,8 @@ static void dequantize_row_q4_1_o(const void * restrict vx, float * restrict y,
 
 #if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
-        const float x_d = ggml_half_to_float_reference(x[i].d);
-        const float x_m = ggml_half_to_float_reference(x[i].m);
+        const float x_d = GGML_FP16_TO_FP32(x[i].d);
+        const float x_m = GGML_FP16_TO_FP32(x[i].m);
 
         const __m256 d_v = _mm256_broadcast_ss(&x_d);
         const __m256 d_m = _mm256_broadcast_ss(&x_m);
@@ -1274,12 +1242,12 @@ static void dequantize_row_q4_1_o(const void * restrict vx, float * restrict y,
         }
 
         // Restore the outlier
-        y[i * QK + x[i].outlier_index] = ggml_half_to_float_reference(x[i].outlier_value);
+        y[i * QK + x[i].outlier_index] = GGML_FP16_TO_FP32(x[i].outlier_value);
     }
 #elif defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
-        const float x_d = ggml_half_to_float_reference(x[i].d);
-        const float x_m = ggml_half_to_float_reference(x[i].m);
+        const float x_d = GGML_FP16_TO_FP32(x[i].d);
+        const float x_m = GGML_FP16_TO_FP32(x[i].m);
 
         const float32x4_t vd = vdupq_n_f32(x_d);
         const float32x4_t vm = vdupq_n_f32(x_m);
@@ -1324,7 +1292,7 @@ static void dequantize_row_q4_1_o(const void * restrict vx, float * restrict y,
         }
 
         // Restore the outlier
-        y[i * QK + x[i].outlier_index] = ggml_half_to_float_reference(x[i].outlier_value);
+        y[i * QK + x[i].outlier_index] = GGML_FP16_TO_FP32(x[i].outlier_value);
     }
 #else
     for (int i = 0; i < nb; i++) {
@@ -7292,12 +7260,12 @@ static void ggml_compute_forward_mul_mat_q4_1_o_f32(
 
             // Here we do fused dequantization and dot product.
             for (int block_index = 0; block_index < block_count; block_index++) {
-                const float block_d = ggml_half_to_float_reference(row_blocks[block_index].d);
-                const float block_m = ggml_half_to_float_reference(row_blocks[block_index].m);
+                const float block_d = GGML_FP16_TO_FP32(row_blocks[block_index].d);
+                const float block_m = GGML_FP16_TO_FP32(row_blocks[block_index].m);
 
                 // 0 .. 31
                 const uint16_t outlier_index = row_blocks[block_index].outlier_index;
-                const float outlier_value = ggml_half_to_float_reference(row_blocks[block_index].outlier_value);
+                const float outlier_value = GGML_FP16_TO_FP32(row_blocks[block_index].outlier_value);
 
                 const uint8_t * restrict quant_nibbles = row_blocks[block_index].qs;
 
@@ -11473,11 +11441,11 @@ void ggml_test_quantization_q4_1_o(void) {
     size_t size = ggml_quantize_q4_1_o(src, dst, QK, QK, hist);
     GGML_TEST_ASSERT(size == 24, "%zd", size);
 
-    float delta_result = ggml_half_to_float_reference(((block_q4_1_o *) dst)->d);
+    float delta_result = GGML_FP16_TO_FP32(((block_q4_1_o *) dst)->d);
     float delta_expected = (src[30] - src[0]) / ((1 << 4) - 1);
     GGML_TEST_ASSERT(delta_result == delta_expected, "%f, %f", delta_result, delta_expected);
 
-    float min_result = ggml_half_to_float_reference(((block_q4_1_o *) dst)->m);
+    float min_result = GGML_FP16_TO_FP32(((block_q4_1_o *) dst)->m);
     float min_expected = src[0];
     GGML_TEST_ASSERT(min_result == min_expected, "%f, %f", min_result, min_expected);
 
@@ -11485,7 +11453,7 @@ void ggml_test_quantization_q4_1_o(void) {
     uint16_t outlier_index_expected = 31;
     GGML_TEST_ASSERT(outlier_index == outlier_index_expected, "%d, %d", outlier_index, outlier_index_expected);
 
-    float outlier_value_result = ggml_half_to_float_reference(((block_q4_1_o *) dst)->outlier_value);
+    float outlier_value_result = GGML_FP16_TO_FP32(((block_q4_1_o *) dst)->outlier_value);
     float outlier_value_expected = src[31];
     GGML_TEST_ASSERT(outlier_value_result == outlier_value_expected, "%f, %f", outlier_value_result, outlier_value_expected);
 
diff --git a/rwkv.cpp b/rwkv.cpp
index 0c331c3..3e004b6 100644
--- a/rwkv.cpp
+++ b/rwkv.cpp
@@ -557,6 +557,13 @@ void rwkv_free(struct rwkv_context * ctx) {
 bool rwkv_quantize_model_file(const char * model_file_path_in, const char * model_file_path_out, uint32_t q_type) {
     RWKV_ASSERT_FALSE(q_type == 2 || q_type == 3 || q_type == 4, "Unsupported quantization type %d", q_type);
 
+    // Needed to initialize FP16 lookup table
+    {
+        struct ggml_init_params params = { 0, NULL };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
     ggml_type type = FORMAT_TYPE_TO_GGML_TYPE[q_type];
 
     printf("Loading model from '%s'\n", model_file_path_in);