Chair of Computer Architecture

University of Freiburg

This is a quick overview of the GGUF file format and should contain all the information needed to parse GGUF files. Alternatively, there is also great documentation available elsewhere.

GGUF File Format

If you encounter something invalid in the file, just throw an error and exit. The definition of the ML model must be sound.

Every GGUF file starts like this:

uint32_t magic;
uint32_t version;
uint64_t n_tensors;
uint64_t n_kv;
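
A minimal sketch of reading and validating this header could look like the following (it assumes f is a FILE * opened in binary mode, that <stdio.h>, <stdint.h>, and <stdlib.h> are included, and it omits most error handling):

uint32_t magic, version;
uint64_t n_tensors, n_kv;

// The magic bytes are the ASCII characters "GGUF"
// (0x46554747 when read as a little-endian uint32_t).
if (fread(&magic, sizeof(magic), 1, f) != 1 || magic != 0x46554747) {
  fprintf(stderr, "not a GGUF file\n");
  exit(1);
}
fread(&version, sizeof(version), 1, f);
fread(&n_tensors, sizeof(n_tensors), 1, f);
fread(&n_kv, sizeof(n_kv), 1, f);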

Key-Value Pairs

There are n_kv key-value pairs. They provide information about the model. A key-value pair looks like this:

struct gguf_kv {
  struct gguf_str key;

  enum gguf_type type;
  union gguf_value value;
};

A gguf_value looks like this:

union gguf_value {
  uint8_t uint8;
  int8_t int8;
  uint16_t uint16;
  int16_t int16;
  uint32_t uint32;
  int32_t int32;
  float float32;
  uint64_t uint64;
  int64_t int64;
  double float64;
  bool bool_;

  struct gguf_str str;

  struct {
    enum gguf_type type;

    uint64_t n;  // GGUFv2
    void *data;
  } arr;
};

You parse everything in succession, i.e., for every key-value pair, you first parse the key, then the type, and lastly the value. Each key-value pair starts with the key, which is a gguf_str:

struct gguf_str {
  uint64_t n;
  char *data;
};

In GGUF files, the terminating NUL character is missing from the strings. Make sure you append it when printing them out, or allocate one extra byte for the '\0' when reading the string and write it yourself; otherwise you will not be able to see the contents properly.
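
A minimal sketch of such a string reader (the helper name read_str is our own; it assumes <stdio.h> and <stdlib.h> are included and omits error handling):

// Read a gguf_str and NUL-terminate it in memory,
// even though the file does not store the '\0'.
struct gguf_str read_str(FILE *f) {
  struct gguf_str s;
  fread(&s.n, sizeof(s.n), 1, f);
  s.data = malloc(s.n + 1);  // one extra byte for the terminating '\0'
  fread(s.data, 1, s.n, f);
  s.data[s.n] = '\0';
  return s;
}

Then comes the gguf_type: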

enum gguf_type {
  GGUF_TYPE_UINT8 = 0,
  GGUF_TYPE_INT8 = 1,
  GGUF_TYPE_UINT16 = 2,
  GGUF_TYPE_INT16 = 3,
  GGUF_TYPE_UINT32 = 4,
  GGUF_TYPE_INT32 = 5,
  GGUF_TYPE_FLOAT32 = 6,
  GGUF_TYPE_BOOL = 7,
  GGUF_TYPE_STRING = 8,
  GGUF_TYPE_ARRAY = 9,
  GGUF_TYPE_UINT64 = 10,
  GGUF_TYPE_INT64 = 11,
  GGUF_TYPE_FLOAT64 = 12,
  GGUF_TYPE_COUNT,  // marks the end of the enum
};

A GGUF_TYPE_UINT32 is just a uint32_t. A GGUF_TYPE_BOOL is a bool, which is available when stdbool.h is included. A GGUF_TYPE_STRING is a gguf_str. A GGUF_TYPE_COUNT is an invalid type. The only thing that is trickier is GGUF_TYPE_ARRAY.

An array is defined like this:

struct {
  enum gguf_type type;

  uint64_t n;  // GGUFv2
  void *data;
} arr;

So you first read in the gguf_type of the elements. Then comes n, i.e., the length of the array. Given the type and the length, you can read in the data. The only types that are not valid for an array are GGUF_TYPE_ARRAY and GGUF_TYPE_COUNT. If you encounter the key general.alignment, save its value for later; it is needed for the alignment computation in the Offset section below.

A lookup-table for the type sizes might be useful:

static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
    [GGUF_TYPE_UINT8] = sizeof(uint8_t),
    [GGUF_TYPE_INT8] = sizeof(int8_t),
    [GGUF_TYPE_UINT16] = sizeof(uint16_t),
    [GGUF_TYPE_INT16] = sizeof(int16_t),
    [GGUF_TYPE_UINT32] = sizeof(uint32_t),
    [GGUF_TYPE_INT32] = sizeof(int32_t),
    [GGUF_TYPE_FLOAT32] = sizeof(float),
    [GGUF_TYPE_BOOL] = sizeof(bool),
    [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
    [GGUF_TYPE_UINT64] = sizeof(uint64_t),
    [GGUF_TYPE_INT64] = sizeof(int64_t),
    [GGUF_TYPE_FLOAT64] = sizeof(double),
    [GGUF_TYPE_ARRAY] = 0,  // undefined
};
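
Putting the pieces together, a sketch of reading a single value could look like this (read_value and read_str are our own hypothetical helpers, the type is assumed to have been validated already, and error handling is omitted):

// Read one gguf_value of the given (already parsed) type.
union gguf_value read_value(FILE *f, enum gguf_type type) {
  union gguf_value v;
  switch (type) {
  case GGUF_TYPE_STRING:
    v.str = read_str(f);
    break;
  case GGUF_TYPE_ARRAY: {
    uint32_t arr_type;
    fread(&arr_type, sizeof(arr_type), 1, f);  // element type, stored as uint32
    v.arr.type = (enum gguf_type)arr_type;
    fread(&v.arr.n, sizeof(v.arr.n), 1, f);    // number of elements
    if (v.arr.type == GGUF_TYPE_STRING) {
      struct gguf_str *strs = malloc(v.arr.n * sizeof(*strs));
      for (uint64_t i = 0; i < v.arr.n; i++) strs[i] = read_str(f);
      v.arr.data = strs;
    } else {
      v.arr.data = malloc(v.arr.n * GGUF_TYPE_SIZE[v.arr.type]);
      fread(v.arr.data, GGUF_TYPE_SIZE[v.arr.type], v.arr.n, f);
    }
    break;
  }
  default:  // all remaining types are fixed-size scalars
    fread(&v, GGUF_TYPE_SIZE[type], 1, f);
    break;
  }
  return v;
}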

Tensor Info

There are n_tensors tensor infos. A tensor info is defined like this:

#define GGML_MAX_DIMS 4
#define GGUF_DEFAULT_ALIGNMENT 32

struct gguf_tensor_info {
  struct gguf_str name;

  uint32_t n_dims;
  uint64_t ne[GGML_MAX_DIMS];

  enum ggml_type type;

  uint64_t
      offset;  // offset from start of `data`, must be a multiple of `ALIGNMENT`

  // for writing API
  const void *data;
  size_t size;
};

First fill ne with ones. Then come the name and n_dims. Iterate over n_dims and fill ne with the respective dimension sizes. Then come the ggml_type and the offset.
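
A sketch of reading one tensor info, under the same assumptions as before (the hypothetical read_str helper from above, error handling omitted):

// Read one gguf_tensor_info entry.
struct gguf_tensor_info info;
for (int i = 0; i < GGML_MAX_DIMS; i++) info.ne[i] = 1;  // default every dim to 1

info.name = read_str(f);
fread(&info.n_dims, sizeof(info.n_dims), 1, f);
for (uint32_t i = 0; i < info.n_dims; i++) {
  fread(&info.ne[i], sizeof(info.ne[i]), 1, f);
}
uint32_t type;
fread(&type, sizeof(type), 1, f);  // ggml_type, stored as uint32
info.type = (enum ggml_type)type;
fread(&info.offset, sizeof(info.offset), 1, f);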

Offset

At this point, we might have to adjust the current offset, i.e., the position in the file right after the tensor infos, where the tensor data section begins. The alignment is the value of general.alignment if that key was present, otherwise GGUF_DEFAULT_ALIGNMENT. Compute offset % alignment. If that is not zero, add alignment - (offset % alignment) to the offset and advance the file buffer accordingly.
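
A sketch of that adjustment, assuming you read from a FILE * (with a memory buffer you would advance your read pointer instead):

// Pad the current file position up to the next multiple of `alignment`.
size_t offset = (size_t)ftell(f);
if (offset % alignment != 0) {
  size_t pad = alignment - (offset % alignment);
  offset += pad;
  fseek(f, (long)pad, SEEK_CUR);  // skip the padding bytes
}
// `offset` now marks the start of the tensor data section.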

Tensor Contents

There are n_tensors to parse. A ggml_tensor has lots of useful information in general, but when reading a tensor from a file, we do not need all of its properties. For a quick explanation of how the computational graph is set up, for example, check out ggml.h from the llama.cpp library. A ggml_tensor is defined like this:

struct ggml_tensor {
  enum ggml_type type;
  enum ggml_backend backend;

  int n_dims;
  int64_t ne[GGML_MAX_DIMS];  // number of elements
  size_t nb[GGML_MAX_DIMS];   // stride in bytes:
                              // nb[0] = sizeof(type)
                              // nb[1] = nb[0]   * ne[0] + padding
                              // nb[i] = nb[i-1] * ne[i-1]

  // compute data
  enum ggml_op op;

  // op params - allocated as int32_t for alignment
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

  bool is_param;

  struct ggml_tensor *grad;
  struct ggml_tensor *src[GGML_MAX_SRC];

  // performance
  int perf_runs;
  int64_t perf_cycles;
  int64_t perf_time_us;

  struct ggml_tensor *view_src;
  size_t view_offs;

  void *data;

  char name[GGML_MAX_NAME];

  void *extra;  // extra things e.g. for ggml-cuda.cu

  char padding[4];
};

You may have noticed several new enums and defines that are required:

enum ggml_backend {
  GGML_BACKEND_CPU = 0,
  GGML_BACKEND_GPU = 10,
  GGML_BACKEND_GPU_SPLIT = 20,
};

enum ggml_op {
  GGML_OP_NONE = 0,

  GGML_OP_DUP,
  GGML_OP_ADD,
  GGML_OP_ADD1,
  GGML_OP_ACC,
  GGML_OP_SUB,
  GGML_OP_MUL,
  GGML_OP_DIV,
  GGML_OP_SQR,
  GGML_OP_SQRT,
  GGML_OP_LOG,
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
  GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_CONCAT,
  GGML_OP_SILU_BACK,
  GGML_OP_NORM,  // normalize
  GGML_OP_RMS_NORM,
  GGML_OP_RMS_NORM_BACK,
  GGML_OP_GROUP_NORM,

  GGML_OP_MUL_MAT,
  GGML_OP_OUT_PROD,

  GGML_OP_SCALE,
  GGML_OP_SET,
  GGML_OP_CPY,
  GGML_OP_CONT,
  GGML_OP_RESHAPE,
  GGML_OP_VIEW,
  GGML_OP_PERMUTE,
  GGML_OP_TRANSPOSE,
  GGML_OP_GET_ROWS,
  GGML_OP_GET_ROWS_BACK,
  GGML_OP_DIAG,
  GGML_OP_DIAG_MASK_INF,
  GGML_OP_DIAG_MASK_ZERO,
  GGML_OP_SOFT_MAX,
  GGML_OP_SOFT_MAX_BACK,
  GGML_OP_ROPE,
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
  GGML_OP_CONV_1D,
  GGML_OP_CONV_2D,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
  GGML_OP_POOL_2D,

  GGML_OP_UPSCALE,  // nearest interpolate

  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
  GGML_OP_FLASH_ATTN_BACK,
  GGML_OP_WIN_PART,
  GGML_OP_WIN_UNPART,
  GGML_OP_GET_REL_POS,
  GGML_OP_ADD_REL_POS,

  GGML_OP_UNARY,

  GGML_OP_MAP_UNARY,
  GGML_OP_MAP_BINARY,

  GGML_OP_MAP_CUSTOM1_F32,
  GGML_OP_MAP_CUSTOM2_F32,
  GGML_OP_MAP_CUSTOM3_F32,

  GGML_OP_MAP_CUSTOM1,
  GGML_OP_MAP_CUSTOM2,
  GGML_OP_MAP_CUSTOM3,

  GGML_OP_CROSS_ENTROPY_LOSS,
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,

  GGML_OP_COUNT,
};

#define GGML_MAX_DIMS 4
#define GGML_MAX_OP_PARAMS 32
#define GGML_MAX_NAME 64

For every tensor, the position of its data in the file is the aligned offset calculated in the Offset section plus the offset from the respective tensor info. You can compute the number of bytes to read like this:

typedef struct {
  size_t type_size;
  int blck_size;
} ggml_type_traits_t;

typedef uint16_t ggml_fp16_t;
#define QK4_0 32
typedef struct {
  ggml_fp16_t d;         // delta
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
#define QK4_1 32
typedef struct {
  ggml_fp16_t d;         // delta
  ggml_fp16_t m;         // min
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
#define QK5_0 32
typedef struct {
  ggml_fp16_t d;         // delta
  uint8_t qh[4];         // 5-th bit of quants
  uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
#define QK5_1 32
typedef struct {
  ggml_fp16_t d;         // delta
  ggml_fp16_t m;         // min
  uint8_t qh[4];         // 5-th bit of quants
  uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
#define QK8_0 32
typedef struct {
  ggml_fp16_t d;    // delta
  int8_t qs[QK8_0]; // quants
} block_q8_0;
#define QK8_1 32
typedef struct {
  float d;          // delta
  float s;          // d * sum(qs[i])
  int8_t qs[QK8_1]; // quants
} block_q8_1;



#define QK_K 256
#define K_SCALE_SIZE 12
//
// Super-block quantization structures
//

// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each
// Effectively 2.5625 bits per weight
typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
    ggml_fp16_t d;           // super-block scale for quantized scales
    ggml_fp16_t dmin;        // super-block scale for quantized mins
} block_q2_K;

// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 3.4375 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8];     // quants - high bit
    uint8_t qs[QK_K/4];        // quants - low 2 bits
    uint8_t scales[2];
    ggml_fp16_t d;             // super-block scale
} block_q3_K;
#else
typedef struct {
    uint8_t hmask[QK_K/8];     // quants - high bit
    uint8_t qs[QK_K/4];        // quants - low 2 bits
    uint8_t scales[12];        // scales, quantized with 6 bits
    ggml_fp16_t d;             // super-block scale
} block_q3_K;
#endif

// 4-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d[2];          // super-block scales/mins
    uint8_t scales[2];         // 4-bit block scales/mins
    uint8_t qs[QK_K/2];        // 4-bit quants
} block_q4_K;
#else
typedef struct {
    ggml_fp16_t d;             // super-block scale for quantized scales
    ggml_fp16_t dmin;          // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];        // 4-bit quants
} block_q4_K;
#endif

// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d;               // super-block scale
    int8_t  scales[QK_K/16];     // 8-bit block scales
    uint8_t qh[QK_K/8];          // quants, high bit
    uint8_t qs[QK_K/2];          // quants, low 4 bits
} block_q5_K;
#else
typedef struct {
    ggml_fp16_t d;               // super-block scale for quantized scales
    ggml_fp16_t dmin;            // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];          // quants, high bit
    uint8_t qs[QK_K/2];          // quants, low 4 bits
} block_q5_K;
#endif

// 6-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 6.5625 bits per weight
typedef struct {
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
    ggml_fp16_t d;           // super-block scale
} block_q6_K;

// This is only used for intermediate quantization and dot products
typedef struct {
    float   d;              // delta
    int8_t  qs[QK_K];       // quants
    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
} block_q8_K;

static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    [GGML_TYPE_I8] =
        {
            .type_size = sizeof(int8_t),
            .blck_size = 1,
        },
    [GGML_TYPE_I16] =
        {
            .type_size = sizeof(int16_t),
            .blck_size = 1,
        },
    [GGML_TYPE_I32] =
        {
            .type_size = sizeof(int32_t),
            .blck_size = 1,
        },
    [GGML_TYPE_F32] =
        {
            .type_size = sizeof(float),
            .blck_size = 1,
        },
    [GGML_TYPE_F16] =
        {
            .type_size = sizeof(ggml_fp16_t),
            .blck_size = 1,
        },
    [GGML_TYPE_Q4_0] =
        {
            .type_size = sizeof(block_q4_0),
            .blck_size = QK4_0,
        },
    [GGML_TYPE_Q4_1] =
        {
            .type_size = sizeof(block_q4_1),
            .blck_size = QK4_1,
        },
    [GGML_TYPE_Q5_0] =
        {
            .type_size = sizeof(block_q5_0),
            .blck_size = QK5_0,
        },
    [GGML_TYPE_Q5_1] =
        {
            .type_size = sizeof(block_q5_1),
            .blck_size = QK5_1,
        },
    [GGML_TYPE_Q8_0] =
        {
            .type_size = sizeof(block_q8_0),
            .blck_size = QK8_0,
        },
    [GGML_TYPE_Q8_1] =
        {
            .type_size = sizeof(block_q8_1),
            .blck_size = QK8_1,
        },
#ifdef GGML_USE_K_QUANTS
    [GGML_TYPE_Q2_K] =
        {
            .type_size = sizeof(block_q2_K),
            .blck_size = QK_K,
        },
    [GGML_TYPE_Q3_K] =
        {
            .type_size = sizeof(block_q3_K),
            .blck_size = QK_K,
        },
    [GGML_TYPE_Q4_K] =
        {
            .type_size = sizeof(block_q4_K),
            .blck_size = QK_K,
        },
    [GGML_TYPE_Q5_K] =
        {
            .type_size = sizeof(block_q5_K),
            .blck_size = QK_K,
        },
    [GGML_TYPE_Q6_K] =
        {
            .type_size = sizeof(block_q6_K),
            .blck_size = QK_K,
        },
    [GGML_TYPE_Q8_K] =
        {
            .type_size = sizeof(block_q8_K),
            .blck_size = QK_K,
        },
#endif
};

int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; }

size_t ggml_type_size(enum ggml_type type) {
  return type_traits[type].type_size;
}

size_t data_size =
  ggml_type_size(info.type) * (info.ne[0] / ggml_blck_size(info.type));
for (uint32_t i = 1; i < info.n_dims; i++) {
  data_size *= info.ne[i];
}
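
A sketch of actually reading the raw tensor data, assuming data_start is the aligned offset computed in the Offset section (the final `offset` value from the sketch there) and f is still the open FILE *; error handling is omitted:

// Seek to this tensor's data and read it into a freshly allocated buffer.
fseek(f, (long)(data_start + info.offset), SEEK_SET);
void *data = malloc(data_size);
fread(data, 1, data_size, f);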

Then create the tensor:

struct ggml_tensor result = {
  .type = info.type,
  .backend = GGML_BACKEND_CPU,
  .n_dims = info.n_dims,
  .ne = {1, 1, 1, 1},
  .nb = {0, 0, 0, 0},
  .op = GGML_OP_NONE,
  .op_params = {0},
  .is_param = false,
  .grad = NULL,
  .src = {NULL},
  .perf_runs = 0,
  .perf_cycles = 0,
  .perf_time_us = 0,
  .view_src = NULL,
  .view_offs = 0,
  .data = data,
  .name = {0},
  .extra = NULL,
  .padding = {0}
};
for (uint32_t i = 0; i < info.n_dims; i++) {
  result.ne[i] = info.ne[i];
}

The variable ne denotes the number of elements in each dimension, and nb denotes the stride in bytes for each dimension.
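
If you also want to fill nb, you can compute the strides from ne the same way ggml does it. A short sketch, using the ggml_type_size and ggml_blck_size helpers from before:

// Compute the byte strides from ne, as described in the ggml_tensor comment.
result.nb[0] = ggml_type_size(result.type);
result.nb[1] = result.nb[0] * (result.ne[0] / ggml_blck_size(result.type));
for (int i = 2; i < GGML_MAX_DIMS; i++) {
  result.nb[i] = result.nb[i - 1] * result.ne[i - 1];
}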

Checking Correctness of Tensor Parsing

To make sure you implemented everything correctly, you might want to dump the contents of each tensor of a model you are reading in. Then compare the dumps with ones created with llama.cpp. In the file llama.cpp, put something like

dump_memory_to_file(cur->data, ggml_nbytes(cur));

in the function load_data_for. Here dump_memory_to_file writes into a fixed dump folder and numbers the files from 000.bin upward, based on the files already present in that folder. Then compile and trigger the function by calling

./quantize --allow-requantize <model_file> COPY

Now you have the dumps from llama.cpp. Run

cmp mycode/dump/<num>.bin llama.cpp/dump/<num>.bin

for all files. If cmp returns without printing anything, you have done everything correctly.

A possible implementation of dump_memory_to_file could look like this:

#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

void dump_memory_to_file(void *ptr, size_t size) {
    const char *foldername = "dump";
    struct stat st = {0};

    // Check if 'dump' directory exists, if not create it
    if (stat(foldername, &st) == -1) {
        mkdir(foldername, 0700);
    }

    DIR *dir = opendir(foldername);
    if (!dir) {
        perror("Error opening directory");
        return;
    }

    int maxNumber = -1;
    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        if (strstr(entry->d_name, ".bin")) {
            int num;
            if (sscanf(entry->d_name, "%d.bin", &num) == 1) {
                if (num > maxNumber) {
                    maxNumber = num;
                }
            }
        }
    }
    closedir(dir);

    // Construct the filename
    char filename[256];
    snprintf(filename, sizeof(filename), "%s/%03d.bin", foldername, maxNumber + 1);

    FILE *file = fopen(filename, "wb");
    if (!file) {
        perror("Error opening file");
        return;
    }

    uint8_t *byte_ptr = (uint8_t *)ptr;
    for (size_t i = 0; i < size; ++i) {
        fwrite(&byte_ptr[i], sizeof(uint8_t), 1, file);
    }

    fclose(file);
}

GGUF Context

Having everything dangle around separately is cumbersome, so llama.cpp uses a context to keep track of everything parsed:

struct gguf_context {
  struct gguf_header header;

  struct gguf_kv *kv;
  struct gguf_tensor_info *infos;

  size_t alignment;
  size_t offset;  // offset of `data` from beginning of file
  size_t size;    // size of `data` in bytes

  // uint8_t * padding;
  void *data;
};
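
Here gguf_header is simply the four fields from the very beginning of the file bundled into a struct:

struct gguf_header {
  uint32_t magic;
  uint32_t version;
  uint64_t n_tensors;
  uint64_t n_kv;
};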

Using a context, you can free the allocated memory very easily.
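
A minimal sketch of such a free function, assuming everything was allocated as in the sketches above (llama.cpp's gguf_free differs in some details):

void gguf_free(struct gguf_context *ctx) {
  // Free the key strings, string values, and array data of every kv pair.
  for (uint64_t i = 0; i < ctx->header.n_kv; i++) {
    free(ctx->kv[i].key.data);
    if (ctx->kv[i].type == GGUF_TYPE_STRING) {
      free(ctx->kv[i].value.str.data);
    } else if (ctx->kv[i].type == GGUF_TYPE_ARRAY) {
      if (ctx->kv[i].value.arr.type == GGUF_TYPE_STRING) {
        struct gguf_str *strs = ctx->kv[i].value.arr.data;
        for (uint64_t j = 0; j < ctx->kv[i].value.arr.n; j++) {
          free(strs[j].data);
        }
      }
      free(ctx->kv[i].value.arr.data);
    }
  }
  // Free the tensor names, the two arrays, and the data blob.
  for (uint64_t i = 0; i < ctx->header.n_tensors; i++) {
    free(ctx->infos[i].name.data);
  }
  free(ctx->kv);
  free(ctx->infos);
  free(ctx->data);
}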

Change your code such that you put everything in this context. Unlike in the previous sections, the tensor data is stored as a single blob in the gguf_context. You can compute the total size of the data section like this (taken from ggml.c):

// compute the total size of the data section, taking into account the
// alignment
{
  ctx.size = 0;
  for (uint32_t i = 0; i < ctx.header.n_tensors; ++i) {
    struct gguf_tensor_info *info = &ctx.infos[i];

    const int64_t ne = (int64_t)info->ne[0] * (int64_t)info->ne[1] *
                       (int64_t)info->ne[2] * (int64_t)info->ne[3];

    if (ne % ggml_blck_size(info->type) != 0) {
      fprintf(stderr,
              "%s: tensor '%s' number of elements (%" PRId64
              ") is not a multiple of block size (%d)\n",
              __func__, info->name.data, ne, ggml_blck_size(info->type));
      fclose(file);
      gguf_free(&ctx);
      exit(1);
    }

    const size_t size_cur =
        (ne * ggml_type_size(info->type)) / ggml_blck_size(info->type);

    ctx.size += GGML_PAD(size_cur, ctx.alignment);
  }
}

GGML_PAD is defined like this:

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
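
For example, GGML_PAD(60, 32) evaluates to (60 + 31) & ~31 = 64, i.e., 60 rounded up to the next multiple of 32. Note that this bit trick only works when n is a power of two, which holds for the default alignment of 32.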