Minor changes
This commit is contained in:
parent
93c8dcae75
commit
fe272dc3d3
|
@ -14,7 +14,7 @@ Inference code runs, but outputs all `NaN`s in logits, most probably due to miss
|
||||||
2. Validate states and logits against [reference implementation](https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_in_150_lines.py) by creating a testing script
|
2. Validate states and logits against [reference implementation](https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_in_150_lines.py) by creating a testing script
|
||||||
3. Heavily refactor code; optimize where possible
|
3. Heavily refactor code; optimize where possible
|
||||||
4. Make FP16 inference work
|
4. Make FP16 inference work
|
||||||
5. Create fancy interface with sockets/shared memory/pipes/etc.
|
5. Create fancy interface with sockets/shared memory/pipes/something else
|
||||||
6. Create Python wrapper with sampling and simple chat interface
|
6. Create Python wrapper with sampling and simple chat interface
|
||||||
7. Write a good `README.md` and publish links to this repo
|
7. Write a good `README.md` and publish links to this repo
|
||||||
8. Make INT4 inference work
|
8. Make INT4 inference work
|
||||||
|
@ -27,7 +27,7 @@ This repo is based on the [llama.cpp repo](https://github.com/ggerganov/llama.cp
|
||||||
- `./rwkv`: directory containing Python scripts
|
- `./rwkv`: directory containing Python scripts
|
||||||
- `./examples/main_rwkw`: directory containing script that loads and infers RWKV model
|
- `./examples/main_rwkw`: directory containing script that loads and infers RWKV model
|
||||||
|
|
||||||
Please do not change files in other directories -- this will make pulling recent changes easier.
|
Please do not change files in other directories — this will make pulling recent changes easier.
|
||||||
|
|
||||||
## How to use
|
## How to use
|
||||||
|
|
||||||
|
@ -46,7 +46,7 @@ cmake .
|
||||||
Download an RWKV model from [Huggingface](https://huggingface.co/BlinkDL) and convert it into `ggml` format:
|
Download an RWKV model from [Huggingface](https://huggingface.co/BlinkDL) and convert it into `ggml` format:
|
||||||
|
|
||||||
```commandline
|
```commandline
|
||||||
python convert_pytorch_rwkv_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float32
|
python rwkv\convert_pytorch_rwkv_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float32
|
||||||
```
|
```
|
||||||
|
|
||||||
Compile and run the script:
|
Compile and run the script:
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
// --- Utilities ---
|
// --- Utilities ---
|
||||||
|
|
||||||
|
// Checks that x is not false. If it is false, prints a fancy message to stderr and aborts execution.
|
||||||
#define RWKV_ASSERT(x, ...) \
|
#define RWKV_ASSERT(x, ...) \
|
||||||
do { \
|
do { \
|
||||||
if (!(x)) { \
|
if (!(x)) { \
|
||||||
|
@ -24,6 +25,9 @@
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
// Formats and prints a message to stderr. Trailing newline is added automatically.
|
||||||
|
#define RWKV_LOG(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); } while (0)
|
||||||
|
|
||||||
// TODO Move to ggml, if correct
|
// TODO Move to ggml, if correct
|
||||||
float ggml_get_f32_2d(struct ggml_tensor * tensor, int i, int j) {
|
float ggml_get_f32_2d(struct ggml_tensor * tensor, int i, int j) {
|
||||||
RWKV_ASSERT(tensor->n_dims == 2, "Not a 2D tensor");
|
RWKV_ASSERT(tensor->n_dims == 2, "Not a 2D tensor");
|
||||||
|
@ -46,8 +50,8 @@ void print_tensor(struct ggml_tensor * tensor, char * name) {
|
||||||
|
|
||||||
RWKV_ASSERT(x >= 6, "Too small tensor");
|
RWKV_ASSERT(x >= 6, "Too small tensor");
|
||||||
|
|
||||||
printf(
|
RWKV_LOG(
|
||||||
"1D tensor %s, shape (%d), [%f %f %f ... %f %f %f]\n",
|
"1D tensor %s, shape (%d), [%f %f %f ... %f %f %f]",
|
||||||
name,
|
name,
|
||||||
x,
|
x,
|
||||||
ggml_get_f32_1d(tensor, 0),
|
ggml_get_f32_1d(tensor, 0),
|
||||||
|
@ -62,8 +66,8 @@ void print_tensor(struct ggml_tensor * tensor, char * name) {
|
||||||
int y = tensor->ne[1];
|
int y = tensor->ne[1];
|
||||||
|
|
||||||
if (y < 6) {
|
if (y < 6) {
|
||||||
printf(
|
RWKV_LOG(
|
||||||
"2D tensor %s, shape (%d, %d), [[%f %f %f ... %f %f %f]]\n",
|
"2D tensor %s, shape (%d, %d), [[%f %f %f ... %f %f %f]]",
|
||||||
name,
|
name,
|
||||||
x,
|
x,
|
||||||
y,
|
y,
|
||||||
|
@ -75,8 +79,8 @@ void print_tensor(struct ggml_tensor * tensor, char * name) {
|
||||||
ggml_get_f32_2d(tensor, x - 1, y - 1)
|
ggml_get_f32_2d(tensor, x - 1, y - 1)
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
printf(
|
RWKV_LOG(
|
||||||
"2D tensor %s, shape (%d, %d), [[%f %f %f ... ] ... [ ... %f %f %f]]\n",
|
"2D tensor %s, shape (%d, %d), [[%f %f %f ... ] ... [ ... %f %f %f]]",
|
||||||
name,
|
name,
|
||||||
x,
|
x,
|
||||||
y,
|
y,
|
||||||
|
@ -88,26 +92,6 @@ void print_tensor(struct ggml_tensor * tensor, char * name) {
|
||||||
ggml_get_f32_2d(tensor, x - 1, y - 1)
|
ggml_get_f32_2d(tensor, x - 1, y - 1)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else if (n_dims == 3) {
|
|
||||||
int x = tensor->ne[0];
|
|
||||||
int y = tensor->ne[1];
|
|
||||||
int z = tensor->ne[2];
|
|
||||||
|
|
||||||
RWKV_ASSERT(z >= 6, "Too small tensor");
|
|
||||||
|
|
||||||
printf(
|
|
||||||
"3D tensor %s, shape (%d, %d, %d), [[[%f %f %f ...] ... [... %f %f %f]]]\n",
|
|
||||||
name,
|
|
||||||
x,
|
|
||||||
y,
|
|
||||||
z,
|
|
||||||
ggml_get_f32_3d(tensor, 0, 0, 0),
|
|
||||||
ggml_get_f32_3d(tensor, 0, 0, 1),
|
|
||||||
ggml_get_f32_3d(tensor, 0, 0, 2),
|
|
||||||
ggml_get_f32_3d(tensor, x - 1, y - 1, z - 3),
|
|
||||||
ggml_get_f32_3d(tensor, x - 1, y - 1, z - 2),
|
|
||||||
ggml_get_f32_3d(tensor, x - 1, y - 1, z - 1)
|
|
||||||
);
|
|
||||||
} else {
|
} else {
|
||||||
RWKV_ASSERT(false, "Unsupported dimension count %d", n_dims);
|
RWKV_ASSERT(false, "Unsupported dimension count %d", n_dims);
|
||||||
}
|
}
|
||||||
|
@ -116,7 +100,7 @@ void print_tensor(struct ggml_tensor * tensor, char * name) {
|
||||||
// Prints tensor name, dimensionality, shape and part of its contents.
|
// Prints tensor name, dimensionality, shape and part of its contents.
|
||||||
#define PRINT_TENSOR(x) print_tensor(x, #x)
|
#define PRINT_TENSOR(x) print_tensor(x, #x)
|
||||||
|
|
||||||
// Same as above, but additionally computes tensor graph before printing the tensor.
|
// Same as PRINT_TENSOR, but additionally computes tensor graph before printing the tensor.
|
||||||
#define COMPUTE_AND_PRINT_TENSOR(ctx, x) do { compute_graph(ctx, x); print_tensor(x, #x); } while (0)
|
#define COMPUTE_AND_PRINT_TENSOR(ctx, x) do { compute_graph(ctx, x); print_tensor(x, #x); } while (0)
|
||||||
|
|
||||||
// Computes value of the tensor and all tensors it depends on.
|
// Computes value of the tensor and all tensors it depends on.
|
||||||
|
@ -201,7 +185,7 @@ void set_block_parameter(std::unordered_map<std::string, struct ggml_tensor *> *
|
||||||
|
|
||||||
// Loads RWKV model metadata and parameters from a file.
|
// Loads RWKV model metadata and parameters from a file.
|
||||||
void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * model) {
|
void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * model) {
|
||||||
printf("Loading model from %s\n", file_path);
|
RWKV_LOG("Loading model from %s", file_path);
|
||||||
FILE * file = fopen(file_path, "rb");
|
FILE * file = fopen(file_path, "rb");
|
||||||
RWKV_ASSERT(file != NULL, "Failed to open file %s", file_path);
|
RWKV_ASSERT(file != NULL, "Failed to open file %s", file_path);
|
||||||
|
|
||||||
|
@ -227,9 +211,9 @@ void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * m
|
||||||
|
|
||||||
RWKV_ASSERT(model->data_type == 0, "Data types other than float32 are not yet supported"); // TODO
|
RWKV_ASSERT(model->data_type == 0, "Data types other than float32 are not yet supported"); // TODO
|
||||||
|
|
||||||
printf("n_vocab = %d\n", model->n_vocab);
|
RWKV_LOG("n_vocab = %d", model->n_vocab);
|
||||||
printf("n_embed = %d\n", model->n_embed);
|
RWKV_LOG("n_embed = %d", model->n_embed);
|
||||||
printf("n_layer = %d\n", model->n_layer);
|
RWKV_LOG("n_layer = %d", model->n_layer);
|
||||||
|
|
||||||
std::unordered_map<std::string, struct ggml_tensor *> parameters;
|
std::unordered_map<std::string, struct ggml_tensor *> parameters;
|
||||||
|
|
||||||
|
@ -288,7 +272,7 @@ void load_rwkv_model(ggml_context * ctx, char * file_path, struct rwkv_model * m
|
||||||
|
|
||||||
fclose(file);
|
fclose(file);
|
||||||
|
|
||||||
printf("Initializing model parameters\n");
|
RWKV_LOG("Initializing model parameters");
|
||||||
|
|
||||||
model->layers.resize(model->n_layer);
|
model->layers.resize(model->n_layer);
|
||||||
|
|
||||||
|
@ -372,7 +356,7 @@ int main(int argc, char ** argv) {
|
||||||
char * logits_out_path = argv[5];
|
char * logits_out_path = argv[5];
|
||||||
|
|
||||||
int32_t token = strtol(token_s, (char **) NULL, 10);
|
int32_t token = strtol(token_s, (char **) NULL, 10);
|
||||||
printf("Token index is %d\n", token);
|
RWKV_LOG("Token index is %d", token);
|
||||||
|
|
||||||
bool create_new_state = strcmp(state_in_path, "") == 0;
|
bool create_new_state = strcmp(state_in_path, "") == 0;
|
||||||
|
|
||||||
|
@ -399,7 +383,7 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_tensor * state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, state_element_count);
|
struct ggml_tensor * state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, state_element_count);
|
||||||
|
|
||||||
if (create_new_state) {
|
if (create_new_state) {
|
||||||
printf("Creating new state\n");
|
RWKV_LOG("Creating new state");
|
||||||
ggml_set_f32(state, 0.0F);
|
ggml_set_f32(state, 0.0F);
|
||||||
|
|
||||||
// TODO Verify correctness
|
// TODO Verify correctness
|
||||||
|
@ -410,7 +394,7 @@ int main(int argc, char ** argv) {
|
||||||
ggml_set_f32(state_part, -1e30F);
|
ggml_set_f32(state_part, -1e30F);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
printf("Loading state from %s\n", state_in_path);
|
RWKV_LOG("Loading state from %s", state_in_path);
|
||||||
int32_t state_file_size = state_element_count * 4;
|
int32_t state_file_size = state_element_count * 4;
|
||||||
|
|
||||||
FILE * state_in_file = fopen(state_in_path, "rb");
|
FILE * state_in_file = fopen(state_in_path, "rb");
|
||||||
|
@ -606,7 +590,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
|
|
||||||
printf("OK\n");
|
RWKV_LOG("OK");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
# // All ints and floats are in machine byte order.
|
# // All ints and floats are in machine byte order.
|
||||||
# // Magic is "ggmf" string bytes.
|
# // Magic is "ggmf" string bytes.
|
||||||
# int32 magic = 0x67676d66;
|
# int32 magic = 0x67676d66;
|
||||||
# int32 version;
|
# int32 version = 100;
|
||||||
# int32 n_vocab;
|
# int32 n_vocab;
|
||||||
# int32 n_embed;
|
# int32 n_embed;
|
||||||
# int32 n_layer;
|
# int32 n_layer;
|
||||||
|
@ -106,6 +106,7 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
|
||||||
1 if tensor.dtype == torch.float16 else 0
|
1 if tensor.dtype == torch.float16 else 0
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# Note that, unlike in llama.cpp, the shape is not reversed here!
|
||||||
for dim in tensor.shape:
|
for dim in tensor.shape:
|
||||||
out_file.write(struct.pack('=i', dim))
|
out_file.write(struct.pack('=i', dim))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue