This is a quick overview of the GGUF file format; it should contain all the information you need to parse GGUF files. Alternatively, the official GGUF specification is also a great reference.
GGUF File Format
If you encounter something invalid in the file, just throw an error and exit. The definition of the ML model must be sound.
Header
Every GGUF file starts with the following header (llama.cpp groups these four fields into a struct gguf_header, which we will meet again in the gguf_context at the end):
struct gguf_header {
    uint32_t magic;     // must be "GGUF", i.e. 0x46554747 when read as a little-endian uint32_t
    uint32_t version;   // format version
    uint64_t n_tensors; // number of tensors in the file
    uint64_t n_kv;      // number of key-value pairs
};
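As a minimal sketch (read_exact and read_header are hypothetical helper names, assuming the file was opened with fopen in binary mode and that the machine is little-endian, since GGUF stores all integers in little-endian byte order), reading and validating the header could look like this:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Read exactly n bytes or bail out; used by all the sketches below.
static void read_exact(FILE *f, void *dst, size_t n) {
    if (fread(dst, 1, n, f) != n) {
        fprintf(stderr, "unexpected end of file\n");
        exit(1);
    }
}

// Read and validate the header defined above.
static struct gguf_header read_header(FILE *f) {
    struct gguf_header h;
    read_exact(f, &h.magic,     sizeof(h.magic));
    read_exact(f, &h.version,   sizeof(h.version));
    read_exact(f, &h.n_tensors, sizeof(h.n_tensors));
    read_exact(f, &h.n_kv,      sizeof(h.n_kv));
    if (h.magic != 0x46554747) { // "GGUF" read as a little-endian uint32_t
        fprintf(stderr, "not a GGUF file\n");
        exit(1);
    }
    return h;
}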
Key-Value Pairs
There are n_kv key-value pairs. They provide information about the model. A key-value pair looks like this:
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
A gguf_value looks like this:
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct {
enum gguf_type type;
uint64_t n; // GGUFv2
void *data;
} arr;
};
You parse everything in succession, i.e., for every key-value pair, you first parse the key, then the type, and lastly the value. Each key-value pair starts with the key, which is a gguf_str:
struct gguf_str {
uint64_t n;
char *data;
};
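Building on the read_exact helper from the header sketch, reading a gguf_str could look like this (read_str is a hypothetical helper; note the extra byte for the terminating NULL character, which is explained next):
// Read a gguf_str: a uint64_t length followed by that many bytes (no NULL terminator on disk).
static struct gguf_str read_str(FILE *f) {
    struct gguf_str s;
    read_exact(f, &s.n, sizeof(s.n));
    s.data = malloc(s.n + 1); // one extra byte so we can terminate the string
    if (s.data == NULL) {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    read_exact(f, s.data, s.n);
    s.data[s.n] = '\0';
    return s;
}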
In GGUF files, the terminating NULL character is missing from the strings. Make sure you append it when printing them out, or simply allocate one extra byte for '\0' and terminate the string yourself; otherwise you will not be able to properly see the content. Then comes the gguf_type:
enum gguf_type {
GGUF_TYPE_UINT8 = 0,
GGUF_TYPE_INT8 = 1,
GGUF_TYPE_UINT16 = 2,
GGUF_TYPE_INT16 = 3,
GGUF_TYPE_UINT32 = 4,
GGUF_TYPE_INT32 = 5,
GGUF_TYPE_FLOAT32 = 6,
GGUF_TYPE_BOOL = 7,
GGUF_TYPE_STRING = 8,
GGUF_TYPE_ARRAY = 9,
GGUF_TYPE_UINT64 = 10,
GGUF_TYPE_INT64 = 11,
GGUF_TYPE_FLOAT64 = 12,
GGUF_TYPE_COUNT, // marks the end of the enum
};
A GGUF_TYPE_UINT32 is just a uint32_t. A GGUF_TYPE_BOOL is a bool, which is available when stdbool.h is included. A GGUF_TYPE_STRING is a gguf_str. A GGUF_TYPE_COUNT is an invalid type. The only thing that is trickier is GGUF_TYPE_ARRAY.
An array is defined like this:
struct {
enum gguf_type type;
uint64_t n; // GGUFv2
void *data;
} arr;
So you first read in the gguf_type of the elements. Then comes n, i.e., the length of the array. Given the type and the length, you can read in the data. The only types that are not valid for an array are GGUF_TYPE_ARRAY and GGUF_TYPE_COUNT. If you encounter the key general.alignment, save its value for later; it overrides the default alignment used when locating the tensor data.
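As a small sketch (assuming kv is a parsed struct gguf_kv; in practice general.alignment is stored as a GGUF_TYPE_UINT32), remembering the alignment might look like this:
#include <string.h>

// Keep track of the alignment; it defaults to 32 (GGUF_DEFAULT_ALIGNMENT, see below).
size_t alignment = 32;
if (strcmp(kv.key.data, "general.alignment") == 0 && kv.type == GGUF_TYPE_UINT32) {
    alignment = kv.value.uint32;
}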
A lookup-table for the type sizes might be useful:
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
[GGUF_TYPE_INT8] = sizeof(int8_t),
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
[GGUF_TYPE_INT16] = sizeof(int16_t),
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
[GGUF_TYPE_INT32] = sizeof(int32_t),
[GGUF_TYPE_FLOAT32] = sizeof(float),
[GGUF_TYPE_BOOL] = sizeof(bool),
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
[GGUF_TYPE_INT64] = sizeof(int64_t),
[GGUF_TYPE_FLOAT64] = sizeof(double),
[GGUF_TYPE_ARRAY] = 0, // undefined
};
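Putting this together, a sketch of reading a single value and a whole key-value pair could look like the following (read_value and read_kv are hypothetical helpers building on read_exact and read_str above; on disk, enum values such as the type are stored as uint32_t, and malloc error handling is omitted for brevity):
// Read one gguf_value of the given (already known) type.
static union gguf_value read_value(FILE *f, enum gguf_type type) {
    union gguf_value v;
    if (type == GGUF_TYPE_STRING) {
        v.str = read_str(f);
    } else if (type == GGUF_TYPE_ARRAY) {
        uint32_t elem_type;
        read_exact(f, &elem_type, sizeof(elem_type));
        v.arr.type = (enum gguf_type) elem_type;
        read_exact(f, &v.arr.n, sizeof(v.arr.n));
        if (v.arr.type == GGUF_TYPE_ARRAY || v.arr.type >= GGUF_TYPE_COUNT) {
            fprintf(stderr, "invalid array element type\n");
            exit(1);
        }
        if (v.arr.type == GGUF_TYPE_STRING) {
            struct gguf_str *strs = malloc(v.arr.n * sizeof(*strs));
            for (uint64_t i = 0; i < v.arr.n; i++) {
                strs[i] = read_str(f);
            }
            v.arr.data = strs;
        } else {
            v.arr.data = malloc(v.arr.n * GGUF_TYPE_SIZE[v.arr.type]);
            read_exact(f, v.arr.data, v.arr.n * GGUF_TYPE_SIZE[v.arr.type]);
        }
    } else if (type >= GGUF_TYPE_COUNT) {
        fprintf(stderr, "invalid value type\n");
        exit(1);
    } else {
        // All remaining types are fixed-size scalars; reading into the union
        // start fills the corresponding member on a little-endian machine.
        read_exact(f, &v, GGUF_TYPE_SIZE[type]);
    }
    return v;
}

// Read one full key-value pair: key, then type, then value.
static struct gguf_kv read_kv(FILE *f) {
    struct gguf_kv kv;
    uint32_t type;
    kv.key = read_str(f);
    read_exact(f, &type, sizeof(type));
    kv.type = (enum gguf_type) type;
    kv.value = read_value(f, kv.type);
    return kv;
}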
Tensor Info
There are n_tensors tensor infos. A tensor info is defined like this:
#define GGML_MAX_DIMS 4
#define GGUF_DEFAULT_ALIGNMENT 32
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void *data;
size_t size;
};
First fill ne with ones. Then comes the name and n_dims. Iterate over n_dims and fill ne with the respective dimension sizes. Then come the ggml_type and the offset.
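A sketch of reading one tensor info with the helpers from above (read_tensor_info is a hypothetical name; on disk, n_dims and the ggml_type are stored as uint32_t):
// Read one gguf_tensor_info entry in the order described above.
static struct gguf_tensor_info read_tensor_info(FILE *f) {
    struct gguf_tensor_info info = {0};
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        info.ne[i] = 1; // unused dimensions stay 1
    }
    info.name = read_str(f);
    read_exact(f, &info.n_dims, sizeof(info.n_dims));
    if (info.n_dims > GGML_MAX_DIMS) {
        fprintf(stderr, "invalid number of dimensions\n");
        exit(1);
    }
    for (uint32_t i = 0; i < info.n_dims; i++) {
        read_exact(f, &info.ne[i], sizeof(info.ne[i]));
    }
    uint32_t type;
    read_exact(f, &type, sizeof(type));
    info.type = (enum ggml_type) type;
    read_exact(f, &info.offset, sizeof(info.offset));
    return info;
}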
Offset
At this point, we might have to adjust the offset, i.e., the current read position in the file, because the tensor data must start at an aligned position. Compute offset % alignment, where alignment is the value of general.alignment if it was present and GGUF_DEFAULT_ALIGNMENT (32) otherwise. If that is not zero, add alignment - (offset % alignment) to the offset and re-adjust the file buffer.
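A sketch of this adjustment, assuming file is the FILE * being read and alignment was determined from the key-value pairs:
// Skip padding so that the data section starts at an aligned offset.
size_t offset = (size_t) ftell(file); // current position: end of the tensor infos
size_t rem    = offset % alignment;
if (rem != 0) {
    offset += alignment - rem;
    fseek(file, (long) offset, SEEK_SET);
}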
Tensor Contents
There are n_tensors tensors to parse. A ggml_tensor has lots of useful information in general, but when reading a tensor from file, we do not need all of its properties. For a quick explanation of how the computational graph is set up, for example, check out ggml.h from the llama.cpp library. A ggml_tensor is defined like this:
struct ggml_tensor {
enum ggml_type type;
enum ggml_backend backend;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = sizeof(type)
// nb[1] = nb[0] * ne[0] + padding
// nb[i] = nb[i-1] * ne[i-1]
// compute data
enum ggml_op op;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
bool is_param;
struct ggml_tensor *grad;
struct ggml_tensor *src[GGML_MAX_SRC];
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
struct ggml_tensor *view_src;
size_t view_offs;
void *data;
char name[GGML_MAX_NAME];
void *extra; // extra things e.g. for ggml-cuda.cu
char padding[4];
};
You may have noticed several new enums and defines that are required:
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
};
enum ggml_op {
GGML_OP_NONE = 0,
GGML_OP_DUP,
GGML_OP_ADD,
GGML_OP_ADD1,
GGML_OP_ACC,
GGML_OP_SUB,
GGML_OP_MUL,
GGML_OP_DIV,
GGML_OP_SQR,
GGML_OP_SQRT,
GGML_OP_LOG,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_ARGMAX,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_CONCAT,
GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_RMS_NORM_BACK,
GGML_OP_GROUP_NORM,
GGML_OP_MUL_MAT,
GGML_OP_OUT_PROD,
GGML_OP_SCALE,
GGML_OP_SET,
GGML_OP_CPY,
GGML_OP_CONT,
GGML_OP_RESHAPE,
GGML_OP_VIEW,
GGML_OP_PERMUTE,
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
GGML_OP_GET_ROWS_BACK,
GGML_OP_DIAG,
GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO,
GGML_OP_SOFT_MAX,
GGML_OP_SOFT_MAX_BACK,
GGML_OP_ROPE,
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CLAMP,
GGML_OP_CONV_1D,
GGML_OP_CONV_2D,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
GGML_OP_FLASH_ATTN_BACK,
GGML_OP_WIN_PART,
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_UNARY,
GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,
GGML_OP_MAP_CUSTOM1_F32,
GGML_OP_MAP_CUSTOM2_F32,
GGML_OP_MAP_CUSTOM3_F32,
GGML_OP_MAP_CUSTOM1,
GGML_OP_MAP_CUSTOM2,
GGML_OP_MAP_CUSTOM3,
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_COUNT,
};
#define GGML_MAX_DIMS 4
#define GGML_MAX_OP_PARAMS 32
#define GGML_MAX_NAME 64
For every tensor, the offset in the file is the data-section offset calculated in the previous section plus the offset from the respective tensor info. The number of bytes to read can be computed like this (the block structs below define the sizes of the quantized types):
typedef struct {
size_t type_size;
int blck_size;
} ggml_type_traits_t;
typedef uint16_t ggml_fp16_t;
#define QK4_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
#define QK4_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
#define QK5_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
#define QK8_0 32
typedef struct {
ggml_fp16_t d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
#define QK8_1 32
typedef struct {
float d; // delta
float s; // d * sum(qs[i])
int8_t qs[QK8_1]; // quants
} block_q8_1;
#define QK_K 256
#define K_SCALE_SIZE 12
//
// Super-block quantization structures
//
// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each
// Effectively 2.5625 bits per weight
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
} block_q2_K;
// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 3.4375 bits per weight
#ifdef GGML_QKK_64
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[2];
ggml_fp16_t d; // super-block scale
} block_q3_K;
#else
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[12]; // scales, quantized with 6 bits
ggml_fp16_t d; // super-block scale
} block_q3_K;
#endif
// 4-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
ggml_fp16_t d[2]; // super-block scales/mins
uint8_t scales[2]; // 4-bit block scales/mins
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
typedef struct {
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#endif
// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
ggml_fp16_t d; // super-block scale
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
#else
typedef struct {
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
#endif
// 6-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 6.5625 bits per weight
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
ggml_fp16_t d; // super-block scale
} block_q6_K;
// This is only used for intermediate quantization and dot products
typedef struct {
float d; // delta
int8_t qs[QK_K]; // quants
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
} block_q8_K;
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] =
{
.type_size = sizeof(int8_t),
.blck_size = 1,
},
[GGML_TYPE_I16] =
{
.type_size = sizeof(int16_t),
.blck_size = 1,
},
[GGML_TYPE_I32] =
{
.type_size = sizeof(int32_t),
.blck_size = 1,
},
[GGML_TYPE_F32] =
{
.type_size = sizeof(float),
.blck_size = 1,
},
[GGML_TYPE_F16] =
{
.type_size = sizeof(ggml_fp16_t),
.blck_size = 1,
},
[GGML_TYPE_Q4_0] =
{
.type_size = sizeof(block_q4_0),
.blck_size = QK4_0,
},
[GGML_TYPE_Q4_1] =
{
.type_size = sizeof(block_q4_1),
.blck_size = QK4_1,
},
[GGML_TYPE_Q5_0] =
{
.type_size = sizeof(block_q5_0),
.blck_size = QK5_0,
},
[GGML_TYPE_Q5_1] =
{
.type_size = sizeof(block_q5_1),
.blck_size = QK5_1,
},
[GGML_TYPE_Q8_0] =
{
.type_size = sizeof(block_q8_0),
.blck_size = QK8_0,
},
[GGML_TYPE_Q8_1] =
{
.type_size = sizeof(block_q8_1),
.blck_size = QK8_1,
},
#ifdef GGML_USE_K_QUANTS
[GGML_TYPE_Q2_K] =
{
.type_size = sizeof(block_q2_K),
.blck_size = QK_K,
},
[GGML_TYPE_Q3_K] =
{
.type_size = sizeof(block_q3_K),
.blck_size = QK_K,
},
[GGML_TYPE_Q4_K] =
{
.type_size = sizeof(block_q4_K),
.blck_size = QK_K,
},
[GGML_TYPE_Q5_K] =
{
.type_size = sizeof(block_q5_K),
.blck_size = QK_K,
},
[GGML_TYPE_Q6_K] =
{
.type_size = sizeof(block_q6_K),
.blck_size = QK_K,
},
[GGML_TYPE_Q8_K] =
{
.type_size = sizeof(block_q8_K),
.blck_size = QK_K,
},
#endif
};
int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; }
size_t ggml_type_size(enum ggml_type type) {
return type_traits[type].type_size;
}
size_t data_size =
ggml_type_size(info.type) * (info.ne[0] / ggml_blck_size(info.type));
for (uint32_t i = 1; i < info.n_dims; i++) {
data_size *= info.ne[i];
}
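With the aligned data-section start from the Offset section (call it data_offset here, a hypothetical name) and the tensor info's own offset, the raw bytes can then be read like this (a sketch):
// Read the raw bytes of this tensor from the file.
void *data = malloc(data_size);
if (data == NULL) {
    fprintf(stderr, "out of memory\n");
    exit(1);
}
fseek(file, (long) (data_offset + info.offset), SEEK_SET);
if (fread(data, 1, data_size, file) != data_size) {
    fprintf(stderr, "failed to read tensor data\n");
    exit(1);
}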
Then create the tensor:
struct ggml_tensor result = {
    .type = info.type,
    .backend = GGML_BACKEND_CPU,
    .n_dims = info.n_dims,
    .ne = {1, 1, 1, 1},
    .nb = {0, 0, 0, 0},
    .op = GGML_OP_NONE,
    .op_params = {0},
    .is_param = false,
    .grad = NULL,
    .src = {NULL},
    .perf_runs = 0,
    .perf_cycles = 0,
    .perf_time_us = 0,
    .view_src = NULL,
    .view_offs = 0,
    .data = data, // the tensor bytes read from the file
    .name = {0},
    .extra = NULL,
    .padding = {0},
};
for (uint32_t i = 0; i < info.n_dims; i++) {
result.ne[i] = info.ne[i];
}
The variable ne denotes the number of elements in each dimension, and nb denotes the stride in bytes for each dimension (see the comments in the struct definition above).
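The sketch above leaves nb zeroed. If you need the strides, e.g., to index into the data, they can be filled in following the comments in the struct (for block-quantized types the first dimension is counted in blocks):
// Fill in the byte strides as described in the ggml_tensor comments.
result.nb[0] = ggml_type_size(result.type);
result.nb[1] = result.nb[0] * (result.ne[0] / ggml_blck_size(result.type));
for (int i = 2; i < GGML_MAX_DIMS; i++) {
    result.nb[i] = result.nb[i - 1] * result.ne[i - 1];
}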
Checking Correctness of Tensor Parsing
To make sure you implemented everything correctly, you might want to dump the contents of each tensor of a model you are reading in. Then compare the dumps with ones created with llama.cpp. In the file llama.cpp, put something like
dump_memory_to_file(cur->data, ggml_nbytes(cur));
in the function load_data_for. Here dump_memory_to_file has a fixed dumping folder, and names the files from 000.bin upward, given the folder contents. Then compile and trigger the function by calling
./quantize --allow-requantize <model_file> COPY
Now you have the dumps from llama.cpp. Run
cmp mycode/dump/<num>.bin llama.cpp/dump/<num>.bin
for all files. If cmp returns without printing anything, you have done everything correctly.
A possible implementation of dump_memory_to_file could look like this:
#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

void dump_memory_to_file(void *ptr, size_t size) {
const char *foldername = "dump";
struct stat st = {0};
// Check if 'dump' directory exists, if not create it
if (stat(foldername, &st) == -1) {
mkdir(foldername, 0700);
}
DIR *dir = opendir(foldername);
if (!dir) {
perror("Error opening directory");
return;
}
int maxNumber = -1;
struct dirent *entry;
while ((entry = readdir(dir)) != NULL) {
if (strstr(entry->d_name, ".bin")) {
int num;
if (sscanf(entry->d_name, "%d.bin", &num) == 1) {
if (num > maxNumber) {
maxNumber = num;
}
}
}
}
closedir(dir);
// Construct the filename
char filename[256];
snprintf(filename, sizeof(filename), "%s/%03d.bin", foldername, maxNumber + 1);
FILE *file = fopen(filename, "wb");
if (!file) {
perror("Error opening file");
return;
}
uint8_t *byte_ptr = (uint8_t *)ptr;
for (size_t i = 0; i < size; ++i) {
fwrite(&byte_ptr[i], sizeof(uint8_t), 1, file);
}
fclose(file);
}
GGUF Context
Having everything dangle around separately is cumbersome, so llama.cpp uses a context to keep track of everything parsed:
struct gguf_context {
struct gguf_header header;
struct gguf_kv *kv;
struct gguf_tensor_info *infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
// uint8_t * padding;
void *data;
};
Using a context, you can free the allocated memory very easily.
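A minimal sketch of such a cleanup function (assuming the strings and array data were heap-allocated as in the sketches above; this is not necessarily identical to the gguf_free in ggml.c):
// Free everything owned by the context.
void gguf_free(struct gguf_context *ctx) {
    if (ctx->kv != NULL) {
        for (uint64_t i = 0; i < ctx->header.n_kv; i++) {
            struct gguf_kv *kv = &ctx->kv[i];
            free(kv->key.data);
            if (kv->type == GGUF_TYPE_STRING) {
                free(kv->value.str.data);
            } else if (kv->type == GGUF_TYPE_ARRAY) {
                if (kv->value.arr.type == GGUF_TYPE_STRING) {
                    struct gguf_str *strs = kv->value.arr.data;
                    for (uint64_t j = 0; j < kv->value.arr.n; j++) {
                        free(strs[j].data);
                    }
                }
                free(kv->value.arr.data);
            }
        }
        free(ctx->kv);
    }
    if (ctx->infos != NULL) {
        for (uint64_t i = 0; i < ctx->header.n_tensors; i++) {
            free(ctx->infos[i].name.data);
        }
        free(ctx->infos);
    }
    free(ctx->data);
}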
Change your code such that you put everything in this context. Unlike in the previous sections, the tensor data is kept in a single blob in the gguf_context. The total size of the data section can be computed like this (taken from ggml.c):
// compute the total size of the data section, taking into account the
// alignment
{
ctx.size = 0;
for (uint32_t i = 0; i < ctx.header.n_tensors; ++i) {
struct gguf_tensor_info *info = &ctx.infos[i];
const int64_t ne = (int64_t)info->ne[0] * (int64_t)info->ne[1] *
(int64_t)info->ne[2] * (int64_t)info->ne[3];
if (ne % ggml_blck_size(info->type) != 0) {
fprintf(stderr,
"%s: tensor '%s' number of elements (%" PRId64
") is not a multiple of block size (%d)\n",
__func__, info->name.data, ne, ggml_blck_size(info->type));
fclose(file);
gguf_free(&ctx);
exit(1);
}
const size_t size_cur =
(ne * ggml_type_size(info->type)) / ggml_blck_size(info->type);
ctx.size += GGML_PAD(size_cur, ctx.alignment);
}
}
GGML_PAD is defined like this:
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
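For example, GGML_PAD(37, 32) evaluates to 64; note that this trick requires n to be a power of two, which holds for the default alignment of 32. With ctx.size known, the whole data section can then be read into the blob in one go (a sketch, assuming the file position is already at the aligned start of the data section):
// Read the entire (padded) data section into a single blob.
ctx.data = malloc(ctx.size);
if (ctx.data == NULL || fread(ctx.data, 1, ctx.size, file) != ctx.size) {
    fprintf(stderr, "failed to read tensor data\n");
    fclose(file);
    gguf_free(&ctx);
    exit(1);
}
Each tensor's bytes then start at (uint8_t *)ctx.data + ctx.infos[i].offset inside this blob, which is exactly what the offset field of the tensor info describes.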