Fix usage of F16C intrinsics in AVX code (#563)
* Fix usage of F16C intrinsics in AVX code when F16C is not defined
This commit is contained in:
		
							parent
							
								
									7b8dbcb78b
								
							
						
					
					
						commit
						a6bdc47cba
					
				
							
								
								
									
										25
									
								
								ggml.c
								
								
								
								
							
							
						
						
									
										25
									
								
								ggml.c
								
								
								
								
							|  | @ -1122,13 +1122,36 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { | |||
| #define GGML_F16_EPR  8 | ||||
| 
 | ||||
| // F16 arithmetic is not supported by AVX, so we use F32 instead
 | ||||
| // we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
 | ||||
| 
 | ||||
| #define GGML_F32Cx8             __m256 | ||||
| #define GGML_F32Cx8_ZERO        _mm256_setzero_ps() | ||||
| #define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x) | ||||
| 
 | ||||
| #if defined(__F16C__) | ||||
| // the  _mm256_cvt intrinsics require F16C
 | ||||
| #define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) | ||||
| #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) | ||||
| #else | ||||
| static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { | ||||
|     float tmp[8]; | ||||
| 
 | ||||
|     for (int i = 0; i < 8; i++) | ||||
|         tmp[i] = GGML_FP16_TO_FP32(x[i]); | ||||
| 
 | ||||
|     return _mm256_loadu_ps(tmp); | ||||
| } | ||||
| static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { | ||||
|     float arr[8]; | ||||
| 
 | ||||
|     _mm256_storeu_ps(arr, y); | ||||
| 
 | ||||
|     for (int i = 0; i < 8; i++) | ||||
|         x[i] = GGML_FP16_TO_FP32(arr[i]); | ||||
| } | ||||
| #define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x) | ||||
| #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) | ||||
| #endif | ||||
| 
 | ||||
| #define GGML_F32Cx8_FMA         GGML_F32x8_FMA | ||||
| #define GGML_F32Cx8_ADD         _mm256_add_ps | ||||
| #define GGML_F32Cx8_MUL         _mm256_mul_ps | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue