Do not quantize head

This commit is contained in:
saharNooby 2023-04-06 16:26:18 +04:00
parent 058b5cd1e6
commit ec99bc1765
1 changed file with 7 additions and 2 deletions

View File

@@ -650,6 +650,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
}; };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], parameter_data_type_str[parameter_data_type]); printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], parameter_data_type_str[parameter_data_type]);
// TODO Should not be hardcoded here, but read from ggml
static const float parameter_data_type_size[] = { static const float parameter_data_type_size[] = {
4.0F, 4.0F,
2.0F, 2.0F,
@@ -659,8 +660,12 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
total_size_orig += (size_t) (nelements * parameter_data_type_size[parameter_data_type]); total_size_orig += (size_t) (nelements * parameter_data_type_size[parameter_data_type]);
} }
// Quantize only 2D tensors // Quantize only 2D tensors, except embedding and head matrices.
bool quantize = n_dims == 2; // Embedding and head do not take much space, especially in bigger models;
// but they significantly increase perplexity when quantized.
bool quantize = n_dims == 2 &&
name != std::string("emb.weight") &&
name != std::string("head.weight");
if (quantize) { if (quantize) {
if (parameter_data_type != 0 && parameter_data_type != 1) { if (parameter_data_type != 0 && parameter_data_type != 1) {