Deduplicate q4 quantization functions (#383)
* Deduplicate q4 quantization functions * Use const; add basic test * Re-enable quantization test * Disable AVX2 flags in CI --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
		
							parent
							
								
									97940520e8
								
							
						
					
					
						commit
						69c92298a9
					
				|  | @ -89,7 +89,7 @@ jobs: | |||
|         run: | | ||||
|           mkdir build | ||||
|           cd build | ||||
|           cmake .. | ||||
|           cmake -DLLAMA_AVX2=OFF .. | ||||
|           cmake --build . --config Release | ||||
|           ctest --output-on-failure | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										155
									
								
								ggml.c
								
								
								
								
							
							
						
						
									
										155
									
								
								ggml.c
								
								
								
								
							|  | @ -403,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes ) | |||
| // method 5
 | ||||
| // blocks of QK elements
 | ||||
| // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
 | ||||
| 
 | ||||
| // reference implementation for deterministic creation of model files
 | ||||
| static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) { | ||||
|     assert(k % QK == 0); | ||||
|     const int nb = k / QK; | ||||
| 
 | ||||
|     const size_t bs = sizeof(float) + QK/2; | ||||
| 
 | ||||
|     uint8_t * restrict pd = ((uint8_t *)y + 0*bs); | ||||
|     uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); | ||||
| 
 | ||||
|     uint8_t pp[QK/2]; | ||||
| 
 | ||||
|     for (int i = 0; i < nb; i++) { | ||||
|         float amax = 0.0f; // absolute max
 | ||||
| 
 | ||||
|         for (int l = 0; l < QK; l++) { | ||||
|             const float v = x[i*QK + l]; | ||||
|             amax = MAX(amax, fabsf(v)); | ||||
|         } | ||||
| 
 | ||||
|         const float d = amax / ((1 << 3) - 1); | ||||
|         const float id = d ? 1.0f/d : 0.0f; | ||||
| 
 | ||||
|         *(float *)pd = d; | ||||
|         pd += bs; | ||||
| 
 | ||||
|         for (int l = 0; l < QK; l += 2) { | ||||
|             const float v0 = x[i*QK + l + 0]*id; | ||||
|             const float v1 = x[i*QK + l + 1]*id; | ||||
| 
 | ||||
|             const uint8_t vi0 = ((int8_t) (round(v0))) + 8; | ||||
|             const uint8_t vi1 = ((int8_t) (round(v1))) + 8; | ||||
| 
 | ||||
|             assert(vi0 >= 0 && vi0 < 16); | ||||
|             assert(vi1 >= 0 && vi1 < 16); | ||||
| 
 | ||||
|             pp[l/2] = vi0 | (vi1 << 4); | ||||
|         } | ||||
| 
 | ||||
|         memcpy(pb, pp, sizeof(pp)); | ||||
|         pb += bs; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { | ||||
|     assert(k % QK == 0); | ||||
| 
 | ||||
| #if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) | ||||
|     const int nb = k / QK; | ||||
|     const size_t bs = sizeof(float) + QK/2; | ||||
| 
 | ||||
|  | @ -413,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { | |||
|     uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); | ||||
| 
 | ||||
|     uint8_t pp[QK/2]; | ||||
| #endif | ||||
| 
 | ||||
| #if __ARM_NEON | ||||
| #if QK == 32 | ||||
|  | @ -569,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { | |||
| #endif | ||||
| #else | ||||
|     // scalar
 | ||||
|     for (int i = 0; i < nb; i++) { | ||||
|         float amax = 0.0f; // absolute max
 | ||||
| 
 | ||||
|         for (int l = 0; l < QK; l++) { | ||||
|             const float v = x[i*QK + l]; | ||||
|             amax = MAX(amax, fabsf(v)); | ||||
|         } | ||||
| 
 | ||||
|         const float d = amax / ((1 << 3) - 1); | ||||
|         const float id = d ? 1.0f/d : 0.0f; | ||||
| 
 | ||||
|         *(float *)pd = d; | ||||
|         pd += bs; | ||||
| 
 | ||||
|         for (int l = 0; l < QK; l += 2) { | ||||
|             const float v0 = x[i*QK + l + 0]*id; | ||||
|             const float v1 = x[i*QK + l + 1]*id; | ||||
| 
 | ||||
|             const uint8_t vi0 = ((int8_t) (round(v0))) + 8; | ||||
|             const uint8_t vi1 = ((int8_t) (round(v1))) + 8; | ||||
| 
 | ||||
|             assert(vi0 >= 0 && vi0 < 16); | ||||
|             assert(vi1 >= 0 && vi1 < 16); | ||||
| 
 | ||||
|             pp[l/2] = vi0 | (vi1 << 4); | ||||
|         } | ||||
| 
 | ||||
|         memcpy(pb, pp, sizeof(pp)); | ||||
|         pb += bs; | ||||
|     } | ||||
|     quantize_row_q4_0_reference(x, y, k); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
|  | @ -10705,121 +10723,62 @@ enum ggml_opt_result ggml_opt( | |||
| 
 | ||||
| ////////////////////////////////////////////////////////////////////////////////
 | ||||
| 
 | ||||
| size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { | ||||
| size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { | ||||
|     const int nb = k / qk; | ||||
|     const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); | ||||
|     const size_t row_size = nb*bs; | ||||
| 
 | ||||
|     assert(k % qk == 0); | ||||
| 
 | ||||
|     const size_t pp_size = qk / 2; | ||||
|     uint8_t * pp = (uint8_t *) alloca(pp_size); | ||||
| 
 | ||||
|     char * pdst = (char *) dst; | ||||
| 
 | ||||
|     for (int j = 0; j < n; j += k) { | ||||
|         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); | ||||
|         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); | ||||
| 
 | ||||
|         quantize_row_q4_0_reference(src + j, pd, k); | ||||
| 
 | ||||
|         for (int i = 0; i < nb; i++) { | ||||
|             float amax = 0.0f; // absolute max
 | ||||
| 
 | ||||
|             { | ||||
|                 for (int l = 0; l < qk; l++) { | ||||
|                     const float v = src[j + i*qk + l]; | ||||
|                     amax = MAX(amax, fabsf(v)); | ||||
|                 } | ||||
| 
 | ||||
|                 const float d = amax / ((1 << 3) - 1); | ||||
|                 const float id = d ? 1.0f/d : 0.0f; | ||||
| 
 | ||||
|                 *(float *) pd = d; | ||||
|                 pd += bs; | ||||
| 
 | ||||
|             for (int l = 0; l < qk; l += 2) { | ||||
|                     const float v0 = (src[j + i*qk + l + 0])*id; | ||||
|                     const float v1 = (src[j + i*qk + l + 1])*id; | ||||
| 
 | ||||
|                     const uint8_t vi0 = ((int8_t) (round(v0))) + 8; | ||||
|                     const uint8_t vi1 = ((int8_t) (round(v1))) + 8; | ||||
| 
 | ||||
|                     assert(vi0 >= 0 && vi0 < 16); | ||||
|                     assert(vi1 >= 0 && vi1 < 16); | ||||
|                 const uint8_t vi0 = pb[l/2] & 0xF; | ||||
|                 const uint8_t vi1 = pb[l/2] >> 4; | ||||
| 
 | ||||
|                 hist[vi0]++; | ||||
|                 hist[vi1]++; | ||||
| 
 | ||||
|                     pp[l/2] = vi0 | (vi1 << 4); | ||||
|             } | ||||
| 
 | ||||
|                 memcpy(pb, pp, pp_size); | ||||
|             pb += bs; | ||||
|         } | ||||
|     } | ||||
|     } | ||||
| 
 | ||||
|     return (n/k)*row_size; | ||||
| } | ||||
| 
 | ||||
| size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { | ||||
| size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { | ||||
|     const int nb = k / qk; | ||||
|     const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); | ||||
|     const size_t row_size = nb*bs; | ||||
| 
 | ||||
|     assert(k % qk == 0); | ||||
| 
 | ||||
|     const size_t pp_size = qk / 2; | ||||
|     uint8_t * pp = (uint8_t *) alloca(pp_size); | ||||
| 
 | ||||
|     char * pdst = (char *) dst; | ||||
| 
 | ||||
|     for (int j = 0; j < n; j += k) { | ||||
|         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); | ||||
|         uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs +   sizeof(float)); | ||||
|         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); | ||||
| 
 | ||||
|         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
 | ||||
|         quantize_row_q4_1(src + j, pd, k); | ||||
| 
 | ||||
|         for (int i = 0; i < nb; i++) { | ||||
|             float min = FLT_MAX; | ||||
|             float max = -FLT_MAX; | ||||
| 
 | ||||
|             { | ||||
|                 for (int l = 0; l < qk; l++) { | ||||
|                     const float v = src[j + i*qk + l]; | ||||
|                     if (v < min) min = v; | ||||
|                     if (v > max) max = v; | ||||
|                 } | ||||
| 
 | ||||
|                 const float d = (max - min) / ((1 << 4) - 1); | ||||
|                 const float id = d ? 1.0f/d : 0.0f; | ||||
| 
 | ||||
|                 *(float *) pd = d; | ||||
|                 *(float *) pm = min; | ||||
|                 pd += bs; | ||||
|                 pm += bs; | ||||
| 
 | ||||
|             for (int l = 0; l < qk; l += 2) { | ||||
|                     const float v0 = (src[j + i*qk + l + 0] - min)*id; | ||||
|                     const float v1 = (src[j + i*qk + l + 1] - min)*id; | ||||
| 
 | ||||
|                     const uint8_t vi0 = round(v0); | ||||
|                     const uint8_t vi1 = round(v1); | ||||
| 
 | ||||
|                     assert(vi0 >= 0 && vi0 < 16); | ||||
|                     assert(vi1 >= 0 && vi1 < 16); | ||||
|                 const uint8_t vi0 = pb[l/2] & 0xF; | ||||
|                 const uint8_t vi1 = pb[l/2] >> 4; | ||||
| 
 | ||||
|                 hist[vi0]++; | ||||
|                 hist[vi1]++; | ||||
| 
 | ||||
|                     pp[l/2] = vi0 | (vi1 << 4); | ||||
|             } | ||||
| 
 | ||||
|                 memcpy(pb, pp, pp_size); | ||||
|             pb += bs; | ||||
|         } | ||||
|     } | ||||
|     } | ||||
| 
 | ||||
|     return (n/k)*row_size; | ||||
| } | ||||
|  |  | |||
							
								
								
									
										4
									
								
								ggml.h
								
								
								
								
							
							
						
						
									
										4
									
								
								ggml.h
								
								
								
								
							|  | @ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt( | |||
| // quantization
 | ||||
| //
 | ||||
| 
 | ||||
| size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); | ||||
| size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); | ||||
| size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist); | ||||
| size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist); | ||||
| 
 | ||||
| //
 | ||||
| // system info
 | ||||
|  |  | |||
|  | @ -1,4 +1,9 @@ | |||
| set(TEST_TARGET test-tokenizer-0) | ||||
| add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) | ||||
| function(llama_add_test source) | ||||
|     get_filename_component(TEST_TARGET ${source} NAME_WE) | ||||
|     add_executable(${TEST_TARGET} ${source}) | ||||
|     target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) | ||||
| add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) | ||||
|     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN}) | ||||
| endfunction() | ||||
| 
 | ||||
| llama_add_test(test-quantize.c) | ||||
| llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) | ||||
|  |  | |||
|  | @ -0,0 +1,42 @@ | |||
| #include "ggml.h" | ||||
| #undef NDEBUG | ||||
| #include <assert.h> | ||||
| #include <math.h> | ||||
| 
 | ||||
| int main(void) { | ||||
|     #define QK 32 | ||||
|     float src[QK]; | ||||
|     uint8_t dst[24]; | ||||
|     int64_t hist[16]; | ||||
| 
 | ||||
|     for (int i = 0; i < QK; i++) { | ||||
|         src[i] = (float)(i + 1); | ||||
|     } | ||||
| 
 | ||||
|     size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist); | ||||
|     assert(size == 20); | ||||
|     float max_result = ((float *)dst)[0]; | ||||
|     float max_expected = src[31] / ((1 << 3) - 1); | ||||
|     assert(max_result == max_expected); | ||||
|     for (int i = 0; i < QK; i++) { | ||||
|         uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF); | ||||
|         uint8_t q4_expected = roundf(src[i] / max_expected) + 8; | ||||
|         assert(q4_result == q4_expected); | ||||
|     } | ||||
| 
 | ||||
|     size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist); | ||||
|     assert(size == 24); | ||||
|     float delta_result = ((float *)dst)[0]; | ||||
|     float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); | ||||
|     assert(delta_result == delta_expected); | ||||
|     float min_result = ((float *)dst)[1]; | ||||
|     float min_expected = src[0]; | ||||
|     assert(min_result == min_expected); | ||||
|     for (int i = 0; i < QK; i++) { | ||||
|         uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF); | ||||
|         uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected); | ||||
|         assert(q4_result == q4_expected); | ||||
|     } | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
		Loading…
	
		Reference in New Issue