diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d6807b6dd47a..ac133665d978 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,8 @@ extern "C" { GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) GGML_TYPE_Q1_0 = 41, - GGML_TYPE_COUNT = 42, + GGML_TYPE_Q2_0 = 42, + GGML_TYPE_COUNT = 43, }; // precision @@ -473,6 +474,7 @@ extern "C" { GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_0 = 28, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index f05683b44cd9..29028b32a2fd 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -96,6 +96,9 @@ typedef sycl::half2 ggml_half2; #define QI1_0 (QK1_0 / 32) #define QR1_0 1 +#define QI2_0 (QK2_0 / 32) +#define QR2_0 1 + #define QI4_0 (QK4_0 / (4 * QR4_0)) #define QR4_0 2 @@ -181,6 +184,13 @@ typedef struct { } block_q1_0; static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0 / 8, "wrong q1_0 block size/padding"); +#define QK2_0 64 +typedef struct { + ggml_half d; // delta (scale) + uint8_t qs[QK2_0 / 4]; // 2 bits per element +} block_q2_0; +static_assert(sizeof(block_q2_0) == sizeof(ggml_half) + QK2_0 / 4, "wrong q2_0 block size/padding"); + #define QK4_0 32 typedef struct { ggml_half d; // delta diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index b0391a67c88d..7aeacfdd5b28 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -17,6 +17,7 @@ #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K @@ -83,6 +84,7 @@ #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) // quants.c #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 @@ -114,6 +116,7 @@ #define quantize_row_q8_K_generic quantize_row_q8_K #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K @@ -163,6 +166,7 @@ #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 @@ -203,6 +207,7 @@ #elif defined(__riscv) // quants.c #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 @@ -244,6 +249,7 @@ #define quantize_row_q8_K_generic quantize_row_q8_K #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K @@ -308,6 +314,7 @@ #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index fe6213329708..9faa4a014193 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -219,6 +219,80 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK2_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + +#if defined(__ARM_NEON) + // Replicate pattern: each byte repeated 4 times + static const uint8_t tbl_idx_lo[16] = {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3}; + static const uint8_t tbl_idx_hi[16] = {4,4,4,4, 5,5,5,5, 6,6,6,6, 7,7,7,7}; + // Right-shift amounts: 0,2,4,6 repeated for each group of 4 + static const int8_t shift_vals[16] = {0,-2,-4,-6, 0,-2,-4,-6, 0,-2,-4,-6, 0,-2,-4,-6}; + + const uint8x16_t idx_lo = vld1q_u8(tbl_idx_lo); + const uint8x16_t idx_hi = vld1q_u8(tbl_idx_hi); + const int8x16_t shifts = vld1q_s8(shift_vals); + const uint8x16_t mask2 = vdupq_n_u8(0x03); + const int8x16_t one = vdupq_n_s8(1); + + float32x4_t sumv = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d); + + // group 64: one Q2_0 block (64 weights) maps to two Q8_0 blocks (2 * 32 = 64) + for (int k = 0; k < 2; k++) { + const block_q8_0 * GGML_RESTRICT yb = &y[i * 2 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + + // Load 8 bytes of packed 2-bit values + const uint8x8_t raw = vld1_u8(&x[i].qs[k * 8]); + const uint8x16_t raw16 = vcombine_u8(raw, raw); + + // First 16 elements: replicate bytes 0-3, shift, mask, subtract 1 + uint8x16_t bytes0 = vqtbl1q_u8(raw16, idx_lo); + int8x16_t qv0 = vsubq_s8( + vreinterpretq_s8_u8(vandq_u8(vshlq_u8(bytes0, shifts), mask2)), + one); + + // Second 16 elements: replicate bytes 4-7, shift, mask, subtract 1 + uint8x16_t bytes1 = vqtbl1q_u8(raw16, idx_hi); + int8x16_t qv1 = vsubq_s8( + vreinterpretq_s8_u8(vandq_u8(vshlq_u8(bytes1, shifts), mask2)), + one); + + // Load Q8_0 values and dot product + const int8x16_t y0 = vld1q_s8(yb->qs); + const int8x16_t y1 = vld1q_s8(yb->qs + 16); + + int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), qv0, y0); + int32x4_t p1 = ggml_vdotq_s32(p0, qv1, y1); + + sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1); + } + } + + sumf = vaddvq_f32(sumv); +#else + ggml_vec_dot_q2_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + return; +#endif + + *s = sumf; +} void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index eb8341c9aecc..15c31fa01ec3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -227,6 +227,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, }, + [GGML_TYPE_Q2_0] = { + .from_float = quantize_row_q2_0, + .vec_dot = ggml_vec_dot_q2_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_Q4_0] = { .from_float = quantize_row_q4_0, .vec_dot = ggml_vec_dot_q4_0_q8_0, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 74611dce7f1a..6ab3fd24c3e4 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -665,6 +665,7 @@ void ggml_compute_forward_add( ggml_compute_forward_add_non_quantized(params, dst); } break; case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -1115,6 +1116,7 @@ void ggml_compute_forward_add1( } } break; case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -1245,6 +1247,7 @@ void ggml_compute_forward_acc( case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -4415,6 +4418,7 @@ void ggml_compute_forward_out_prod( switch (src0->type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -4691,6 +4695,7 @@ void ggml_compute_forward_set( case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -4915,6 +4920,7 @@ void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -5641,6 +5647,7 @@ void ggml_compute_forward_clamp( } break; case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index e5f9a4083f9c..5e36459f8cbc 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -26,6 +26,10 @@ void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in quantize_row_q1_0_ref(x, y, k); } +void quantize_row_q2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q2_0_ref(x, y, k); +} + void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q4_0_ref(x, y, k); } @@ -170,6 +174,53 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } +void ggml_vec_dot_q2_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK2_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d); + + float sumi = 0.0f; + + // group 64: one Q2_0 block (64 weights) maps to two Q8_0 blocks (2 * 32 = 64) + for (int k = 0; k < 2; k++) { + const block_q8_0 * GGML_RESTRICT yb = &y[i * 2 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + int sumi_block = 0; + + const uint8_t * GGML_RESTRICT qs = &x[i].qs[k * 8]; + const int8_t * GGML_RESTRICT qy = yb->qs; + + for (int b = 0; b < 8; ++b) { + const uint8_t byte = qs[b]; + // Extract 4 two-bit values, map {0,1,2,3} -> {-1,0,1,2} + sumi_block += ((int)((byte >> 0) & 3) - 1) * qy[b*4 + 0]; + sumi_block += ((int)((byte >> 2) & 3) - 1) * qy[b*4 + 1]; + sumi_block += ((int)((byte >> 4) & 3) - 1) * qy[b*4 + 2]; + sumi_block += ((int)((byte >> 6) & 3) - 1) * qy[b*4 + 3]; + } + + sumi += d1 * sumi_block; + } + + sumf += d0 * sumi; + } + + *s = sumf; +} void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d4bc87a1c052..93ea7eeffe5b 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -13,6 +13,7 @@ extern "C" { // Quantization void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -38,6 +39,7 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, // Dot product void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -71,6 +73,7 @@ void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15d231f70c0d..1ebc50a763f1 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -71,6 +71,44 @@ void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_REST } } +void quantize_row_q2_0_ref(const float * GGML_RESTRICT x, block_q2_0 * GGML_RESTRICT y, int64_t k) { + static const int qk = QK2_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + // Compute scale as max absolute value in the block + float amax = 0.0f; + for (int j = 0; j < qk; j++) { + const float a = fabsf(x[i*qk + j]); + if (a > amax) amax = a; + } + const float d = amax; + const float id = d > 0.0f ? 1.0f / d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + // Clear quant bytes + for (int j = 0; j < qk / 4; ++j) { + y[i].qs[j] = 0; + } + + // Encode 2-bit values: round(w/d) clamped to [-1, 2], then add 1 + // 00 (-1) = -scale, 01 (0) = 0, 10 (+1) = +scale, 11 (+2) = 2*scale + for (int j = 0; j < qk; ++j) { + const float w = x[i*qk + j]; + int q = (int)roundf(w * id) + 1; + if (q < 0) q = 0; + if (q > 3) q = 3; + const int byte_index = j / 4; + const int bit_offset = (j % 4) * 2; + y[i].qs[byte_index] |= ((uint8_t)q << bit_offset); + } + } +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; @@ -398,6 +436,26 @@ void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRI } } +void dequantize_row_q2_0(const block_q2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + static const int qk = QK2_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + const int byte_index = j / 4; + const int bit_offset = (j % 4) * 2; + const uint8_t q = (x[i].qs[byte_index] >> bit_offset) & 0x03; + // 00=-1, 01=0, 10=+1, 11=+2 + y[i*qk + j] = ((int)q - 1) * d; + } + } +} + void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; @@ -2052,6 +2110,20 @@ size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } +size_t quantize_q2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + quantize_row_q2_0_ref(src, dst, (int64_t)nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q2_0, n_per_row); + } + size_t row_size = ggml_row_size(GGML_TYPE_Q2_0, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q2_0_ref(src, (block_q2_0*)qrow, n_per_row); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { @@ -5461,6 +5533,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_q1_0, data, nb); } break; + case GGML_TYPE_Q2_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q2_0, data, nb); + } break; case GGML_TYPE_Q4_0: { VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index d56c86da8909..75188f1af180 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -15,6 +15,7 @@ extern "C" { // Quantization GGML_API void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q2_0_ref(const float * GGML_RESTRICT x, block_q2_0 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); @@ -43,6 +44,7 @@ GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_ // Dequantization GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q2_0(const block_q2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -93,6 +95,7 @@ GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTR GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 387826b6d932..c200f08e2f6c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4058,6 +4058,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } #endif CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q1_0], matmul_q1_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_0], matmul_q2_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0], matmul_q4_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1], matmul_q4_1_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM2(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0], matmul_q5_0_f16, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) @@ -4089,6 +4090,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } #endif CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) + CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_0], matmul_id_subgroup_q2_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) CREATE_MM2(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f16, mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 5) @@ -4152,6 +4154,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { if (device->coopmat_acc_f16_support) { CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_0], matmul_q2_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); @@ -4176,6 +4179,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM2(GGML_TYPE_NVFP4, pipeline_dequant_mul_mat_mat[GGML_TYPE_NVFP4], matmul_nvfp4_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); } else { CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_0].f32acc, matmul_q2_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); @@ -4212,6 +4216,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { #endif CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); + CREATE_MM2(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_0], matmul_id_subgroup_q2_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id); @@ -4293,6 +4298,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0], matmul_q1_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM2(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_0], matmul_q2_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0], matmul_q4_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1], matmul_q4_1_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0], matmul_q5_0_f32, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); @@ -4339,6 +4345,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_subgroup_f16_f32, wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_subgroup_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM2(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_0], matmul_id_subgroup_q2_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_subgroup_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_subgroup_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_subgroup_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); @@ -4384,6 +4391,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM_NODOT2(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0], matmul_id_q1_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM2(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_0], matmul_id_q2_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM2(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); @@ -4460,6 +4468,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_bf16, matmul_bf16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q1_0].f32acc, matmul_q1_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); + CREATE_MM(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_0].f32acc, matmul_q2_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, , 0); @@ -4506,6 +4515,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_subgroup_bf16, , wg_denoms, warptile_id, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size_16); CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_subgroup_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); + CREATE_MM(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_0].f32acc, matmul_id_subgroup_q2_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_subgroup_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_subgroup_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_subgroup_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, mul_mat_subgroup_size); @@ -4534,6 +4544,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_MM(GGML_TYPE_BF16, pipeline_matmul_id_bf16, matmul_id_bf16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q1_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q1_0].f32acc, matmul_id_q1_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); + CREATE_MM(GGML_TYPE_Q2_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_0].f32acc, matmul_id_q2_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, mul_mat_id_param_count, _id, 0); @@ -4628,6 +4639,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f32_f32", arr_dmmv_q1_0_f32_f32_len[reduc], arr_dmmv_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q2_0][i], "mul_mat_vec_q2_0_f32_f32", arr_dmmv_q2_0_f32_f32_len[reduc], arr_dmmv_q2_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32", arr_dmmv_q4_1_f32_f32_len[reduc], arr_dmmv_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32", arr_dmmv_q5_0_f32_f32_len[reduc], arr_dmmv_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); @@ -4654,6 +4666,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q1_0][i], "mul_mat_vec_q1_0_f16_f32", arr_dmmv_q1_0_f16_f32_len[reduc], arr_dmmv_q1_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q2_0][i], "mul_mat_vec_q2_0_f16_f32", arr_dmmv_q2_0_f16_f32_len[reduc], arr_dmmv_q2_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32", arr_dmmv_q4_1_f16_f32_len[reduc], arr_dmmv_q4_1_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32", arr_dmmv_q5_0_f16_f32_len[reduc], arr_dmmv_q5_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size); @@ -4706,6 +4719,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", arr_dmmv_id_f16_f32_f32_len[reduc], arr_dmmv_id_f16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", arr_dmmv_id_bf16_f32_f32_len[reduc], arr_dmmv_id_bf16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q1_0], "mul_mat_vec_id_q1_0_f32", arr_dmmv_id_q1_0_f32_f32_len[reduc], arr_dmmv_id_q1_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q2_0], "mul_mat_vec_id_q2_0_f32", arr_dmmv_id_q2_0_f32_f32_len[reduc], arr_dmmv_id_q2_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", arr_dmmv_id_q4_0_f32_f32_len[reduc], arr_dmmv_id_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", arr_dmmv_id_q4_1_f32_f32_len[reduc], arr_dmmv_id_q4_1_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", arr_dmmv_id_q5_0_f32_f32_len[reduc], arr_dmmv_id_q5_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size); @@ -4762,6 +4776,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q1_0], "dequant_q1_0", dequant_q1_0_len, dequant_q1_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 8, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q2_0], "dequant_q2_0", dequant_q2_0_len, dequant_q2_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 8, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -4789,6 +4804,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_BF16], "get_rows_bf16", get_rows_bf16_len, get_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q1_0], "get_rows_q1_0", get_rows_q1_0_len, get_rows_q1_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q2_0], "get_rows_q2_0", get_rows_q2_0_len, get_rows_q2_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); @@ -4816,6 +4832,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_BF16], "get_rows_bf16_f32", get_rows_bf16_f32_len, get_rows_bf16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q1_0], "get_rows_q1_0_f32", get_rows_q1_0_f32_len, get_rows_q1_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q2_0], "get_rows_q2_0_f32", get_rows_q2_0_f32_len, get_rows_q2_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); @@ -4899,6 +4916,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_cpy_transpose_16, "cpy_transpose_16", cpy_transpose_16_len, cpy_transpose_16_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q1_0], "cpy_f32_q1_0", cpy_f32_q1_0_len, cpy_f32_q1_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q2_0], "cpy_f32_q2_0", cpy_f32_q2_0_len, cpy_f32_q2_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1); @@ -4911,6 +4929,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_F16], "set_rows_f16" #itype, set_rows_f16 ## itype ## _len, set_rows_f16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_BF16], "set_rows_bf16" #itype, set_rows_bf16 ## itype ## _len, set_rows_bf16 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q1_0], "set_rows_q1_0" #itype, set_rows_q1_0 ## itype ## _len, set_rows_q1_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q2_0], "set_rows_q2_0" #itype, set_rows_q2_0 ## itype ## _len, set_rows_q2_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_0], "set_rows_q4_0" #itype, set_rows_q4_0 ## itype ## _len, set_rows_q4_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q4_1], "set_rows_q4_1" #itype, set_rows_q4_1 ## itype ## _len, set_rows_q4_1 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ ggml_vk_create_pipeline(device, device->pipeline_set_rows ## itype [GGML_TYPE_Q5_0], "set_rows_q5_0" #itype, set_rows_q5_0 ## itype ## _len, set_rows_q5_0 ## itype ## _data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true); \ @@ -4924,6 +4943,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q1_0], "cpy_q1_0_f32", cpy_q1_0_f32_len, cpy_q1_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q1_0), 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q2_0], "cpy_q2_0_f32", cpy_q2_0_f32_len, cpy_q2_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q2_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1); @@ -6800,6 +6820,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type switch (type) { case GGML_TYPE_F32: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -6873,6 +6894,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte switch (src0_type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -6940,6 +6962,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -7032,6 +7055,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co switch (src0_type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -7102,6 +7126,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -8099,6 +8124,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const if (src->type == GGML_TYPE_F32) { switch (to) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -8114,6 +8140,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const if (to == GGML_TYPE_F32) { switch (src->type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -16656,6 +16683,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -16761,6 +16789,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -16795,6 +16824,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -16819,6 +16849,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -16835,6 +16866,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp index 710c15296da2..6cf7cf904aa1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp @@ -208,6 +208,32 @@ void quantize(uint dst_idx, uint src_idx) } #endif +#if defined(DATA_A_Q2_0) +void quantize(uint dst_idx, uint src_idx) +{ + float amax = 0.0; + + [[unroll]] for (int j = 0; j < QUANT_K; ++j) { + amax = max(amax, abs(data_s[src_idx + j])); + } + + const float d = amax; + const float id = (d > 0.0) ? 1.0/d : 0.0; + + data_q[dst_idx].d = float16_t(d); + + [[unroll]] for (int j = 0; j < QUANT_K / 4; ++j) { + data_q[dst_idx].qs[j] = uint8_t(0); + } + + [[unroll]] for (int j = 0; j < QUANT_K; ++j) { + int q = int(round(data_s[src_idx + j] * id)) + 1; + q = clamp(q, 0, 3); + data_q[dst_idx].qs[j / 4] |= uint8_t(q << ((j % 4) * 2)); + } +} +#endif + #if defined(DATA_A_IQ4_NL) uint best_index(float x) { if (x <= kvalues_iq4nl[0]) return 0; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl index e67299fdeca0..ad5f65219deb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl @@ -143,6 +143,25 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) { } #endif +#if defined(DATA_A_Q2_0) +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const uint byte_val = uint(data_a[a_offset + ib].qs[iqs / 4u]); + const uint shift = (iqs % 4u) * 2u; + return vec2( + float(int((byte_val >> shift) & 3u) - 1), + float(int((byte_val >> (shift + 2u)) & 3u) - 1)); +} +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const uint byte_val = uint(data_a[a_offset + ib].qs[iqs / 4u]); + const uint shift = (iqs % 4u) * 2u; + return vec4( + float(int((byte_val >> shift) & 3u) - 1), + float(int((byte_val >> (shift + 2u)) & 3u) - 1), + float(int((byte_val >> (shift + 4u)) & 3u) - 1), + float(int((byte_val >> (shift + 6u)) & 3u) - 1)); +} +#endif + #if defined(DATA_A_IQ1_S) vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint ib32 = iqs / 32; @@ -536,6 +555,12 @@ vec2 get_dm(uint ib, uint a_offset) { } #endif +#if defined(DATA_A_Q2_0) +vec2 get_dm(uint ib, uint a_offset) { + return vec2(float(data_a[a_offset + ib].d), 0); +} +#endif + #if defined(DATA_A_MXFP4) vec2 get_dm(uint ib, uint a_offset) { return vec2(e8m0_to_fp32(data_a[a_offset + ib].e), 0); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl index 7171cbfa5599..75a431034658 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl @@ -46,6 +46,19 @@ f16vec4 dequantFuncQ1_0_v(const in decodeBufQ1_0 bl, const in uint blockCoords[2 (qs_nib & 8u) != 0u ? d : md); } +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ2_0 { + block_q2_0 block; +}; + +float16_t dequantFuncQ2_0(const in decodeBufQ2_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint byte_val = uint(bl.block.qs[idx >> 2]); + const uint shift = (idx & 3u) * 2u; + return float16_t(int((byte_val >> shift) & 3u) - 1) * d; +} + layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 { block_q4_0_packed16 block; }; @@ -1304,6 +1317,8 @@ f16vec4 dequantFuncNVFP4_v(const in decodeBufNVFP4 bl, const in uint blockCoords #if defined(DATA_A_Q1_0) #define dequantFuncA dequantFuncQ1_0 #define dequantFuncA_v dequantFuncQ1_0_v +#elif defined(DATA_A_Q2_0) +#define dequantFuncA dequantFuncQ2_0 #elif defined(DATA_A_Q4_0) #define dequantFuncA dequantFuncQ4_0 #define dequantFuncA_v dequantFuncQ4_0_v diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_0.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_0.comp new file mode 100644 index 000000000000..49383d7d308b --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_0.comp @@ -0,0 +1,30 @@ +#version 450 + +#include "dequant_head.glsl" + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {block_q2_0 data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; + +// Q2_0 group 64: 64 weights per block, 16 packed bytes (8 byte-pairs). +// Each invocation dequantizes 8 contiguous weights (one byte-pair: qs[bp*2], qs[bp*2+1]), +// matching the previous 8-weights-per-thread / 2048-weights-per-workgroup dispatch. +void main() { + const uint elem = (gl_WorkGroupID.x * 256 + gl_LocalInvocationID.x) * 8; + if (elem >= p.nel) { + return; + } + + const uint ib = elem / 64; // block index + const uint bp = (elem % 64) / 8; // byte-pair within the block (0..7) + + const float d = float(data_a[ib].d); + const uint b0 = uint(data_a[ib].qs[bp*2 ]); + const uint b1 = uint(data_a[ib].qs[bp*2 + 1]); + + [[unroll]] for (uint l = 0; l < 4; ++l) { + data_b[elem + l ] = D_TYPE(float(int((b0 >> (l*2u)) & 3u) - 1) * d); + data_b[elem + l + 4] = D_TYPE(float(int((b1 >> (l*2u)) & 3u) - 1) * d); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index 73595168984c..44d59089f297 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -144,6 +144,23 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin buf_a[buf_idx + 1] = FLOAT_TYPEV2((bits & 0x04u) != 0u ? d : -d, (bits & 0x08u) != 0u ? d : -d); buf_a[buf_idx + 2] = FLOAT_TYPEV2((bits & 0x10u) != 0u ? d : -d, (bits & 0x20u) != 0u ? d : -d); buf_a[buf_idx + 3] = FLOAT_TYPEV2((bits & 0x40u) != 0u ? d : -d, (bits & 0x80u) != 0u ? d : -d); +#elif defined(DATA_A_Q2_0) + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; + + // qs bytes per block = QUANT_K/4 (16 for Q2_0 group 64, 32 for PQ2_0 group 128) + const uint ib = idx / (QUANT_K / 4u); + const uint iqs = idx % (QUANT_K / 4u); + + const float d = float(data_a[ib].d); + const uint byte_val = uint(data_a[ib].qs[iqs]); + + buf_a[buf_idx ] = FLOAT_TYPEV2( + float(int( byte_val & 3u) - 1) * d, + float(int((byte_val >> 2u) & 3u) - 1) * d); + buf_a[buf_idx + 1] = FLOAT_TYPEV2( + float(int((byte_val >> 4u) & 3u) - 1) * d, + float(int((byte_val >> 6u) & 3u) - 1) * d); #elif defined(DATA_A_Q2_K) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl index 8c6b20c68894..ef21a31e9590 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl @@ -206,6 +206,23 @@ struct block_q1_0 #define A_TYPE block_q1_0 #endif +#define QUANT_K_Q2_0 64 +#define QUANT_R_Q2_0 1 + +struct block_q2_0 +{ + float16_t d; + uint8_t qs[QUANT_K_Q2_0 / 4]; +}; + +#if defined(DATA_A_Q2_0) +#define QUANT_K QUANT_K_Q2_0 +#define QUANT_R QUANT_R_Q2_0 +#define QUANT_AUXF 1 +#define A_TYPE block_q2_0 +#endif + + #define QUANT_K_Q8_1 32 #define QUANT_R_Q8_1 1 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 7bcb14608148..4c47ec1b8539 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -46,6 +46,7 @@ const std::vector type_names = { "f32", "f16", "q1_0", + "q2_0", "q4_0", "q4_1", "q5_0", @@ -570,7 +571,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c std::string load_vec_quant = "2"; if ((tname == "q1_0") || (tname == "q4_0") || (tname == "q4_1") || (tname == "q5_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s")) load_vec_quant = "8"; - else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4") || (tname == "nvfp4")) + else if ((tname == "q2_0") || (tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4") || (tname == "nvfp4")) load_vec_quant = "4"; if (tname == "bf16") { @@ -791,12 +792,12 @@ void process_shaders() { string_to_spv("cpy_transpose_16", "copy_transpose.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}}); string_to_spv("cpy_transpose_32", "copy_transpose.comp", {{"A_TYPE", "uint"}, {"D_TYPE", "uint"}}); - for (std::string t : {"q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { + for (std::string t : {"q1_0", "q2_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); } - for (std::string t : {"f32", "f16", "bf16", "q1_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { + for (std::string t : {"f32", "f16", "bf16", "q1_0", "q2_0", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) { string_to_spv("set_rows_" + t + "_i32", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uint"}, {"B_SIZE", "32"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("set_rows_" + t + "_i64", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"B_SIZE", "64"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b43016c87d21..3d682dcb2af1 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -674,6 +674,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q1_0, .from_float_ref = (ggml_from_float_t) quantize_row_q1_0_ref, }, + [GGML_TYPE_Q2_0] = { + .type_name = "q2_0", + .blck_size = QK2_0, + .type_size = sizeof(block_q2_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_0_ref, + }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", .blck_size = QK4_0, @@ -1410,6 +1418,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; case GGML_FTYPE_MOSTLY_Q1_0: wtype = GGML_TYPE_Q1_0; break; + case GGML_FTYPE_MOSTLY_Q2_0: wtype = GGML_TYPE_Q2_0; break; case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; @@ -7732,6 +7741,7 @@ size_t ggml_quantize_chunk( switch (type) { case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_0: result = quantize_q2_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 584594097346..2ebdd52bd12e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -4356,6 +4356,7 @@ class GGMLQuantizationType(IntEnum): MXFP4 = 39 NVFP4 = 40 Q1_0 = 41 + Q2_0 = 42 class ExpertGatingFuncType(IntEnum): @@ -4410,6 +4411,7 @@ class LlamaFileType(IntEnum): MOSTLY_MXFP4_MOE = 38 # except 1d tensors MOSTLY_NVFP4 = 39 # except 1d tensors MOSTLY_Q1_0 = 40 # except 1d tensors + MOSTLY_Q2_0 = 41 # except 1d tensors GUESSED = 1024 # not specified in the model file @@ -4535,6 +4537,7 @@ class VisionProjectorType: GGMLQuantizationType.MXFP4: (32, 1 + 16), GGMLQuantizationType.NVFP4: (64, 4 + 32), GGMLQuantizationType.Q1_0: (128, 2 + 16), + GGMLQuantizationType.Q2_0: (64, 2 + 16), } diff --git a/include/llama.h b/include/llama.h index 27e480674282..4ea072e8d11b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -155,6 +155,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_0 = 41, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 0d1cf3cc33bb..b211950740d8 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -37,6 +37,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0"; + case LLAMA_FTYPE_MOSTLY_Q2_0: return "Q2_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; @@ -761,6 +762,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break; case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break; + case GGML_TYPE_Q2_0: ftype = LLAMA_FTYPE_MOSTLY_Q2_0; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cf92ce4bb8b7..140974dc36ac 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -380,6 +380,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: // types on the right: block size 32 case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q2_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_TQ1_0: @@ -480,7 +481,7 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { + else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0 || ftype == LLAMA_FTYPE_MOSTLY_Q2_0) { new_type = GGML_TYPE_Q4_K; } } @@ -800,6 +801,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16; case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32; case LLAMA_FTYPE_MOSTLY_Q1_0: return GGML_TYPE_Q1_0; + case LLAMA_FTYPE_MOSTLY_Q2_0: return GGML_TYPE_Q2_0; case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e284a58d1c67..4445425bc737 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -213,6 +213,7 @@ llama_build_and_test( peg-parser/tests.h ) llama_build_and_test(test-regex-partial.cpp) +llama_build_and_test(test-vulkan-q2_0-shader-sim.cpp) if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf") diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8705da20b1dc..d90d1c95719b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7597,6 +7597,7 @@ static const ggml_type all_types[] = { GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0, GGML_TYPE_Q1_0, + GGML_TYPE_Q2_0, GGML_TYPE_MXFP4, GGML_TYPE_NVFP4, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, @@ -7611,6 +7612,7 @@ static const ggml_type base_types[] = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, // for I8MM tests GGML_TYPE_Q1_0, + GGML_TYPE_Q2_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, // for I8MM tests GGML_TYPE_Q4_K, @@ -7623,6 +7625,7 @@ static const ggml_type other_types[] = { GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0, GGML_TYPE_Q1_0, + GGML_TYPE_Q2_0, GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index a05fab50421f..b79e3d193b7a 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -150,6 +150,7 @@ int main(int argc, char * argv[]) { type == GGML_TYPE_Q1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_BINARY : type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_Q2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : @@ -175,7 +176,7 @@ int main(int argc, char * argv[]) { ? MAX_DOT_PRODUCT_ERROR_LOWBIT : type == GGML_TYPE_Q1_0 ? MAX_DOT_PRODUCT_ERROR_BINARY - : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 + : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 || type == GGML_TYPE_Q2_0 ? MAX_DOT_PRODUCT_ERROR_TERNARY : type == GGML_TYPE_NVFP4 ? MAX_DOT_PRODUCT_ERROR_FP4 diff --git a/tests/test-vulkan-q2_0-shader-sim.cpp b/tests/test-vulkan-q2_0-shader-sim.cpp new file mode 100644 index 000000000000..9ff7526e0449 --- /dev/null +++ b/tests/test-vulkan-q2_0-shader-sim.cpp @@ -0,0 +1,959 @@ +// cpu simulation of the q2_0 vulkan shader functions. +// +// this file is one step of a three-step proof of extensional equivalence between +// the q2_0 glsl shader code and the cpu reference in ggml-quants.c. +// +// the first step lives in this file: the c++ functions named sim_* are literal +// text-level translations of the glsl functions, with the glsl source quoted in +// the comment immediately above each sim_*. a reader can verify by visual +// inspection that the c++ and the glsl compute the same value. +// +// the second step also lives in this file: the simulator runs against the cpu +// reference for randomized blocks and for every byte value x every slot x every +// alignment (exhaustive). passing this proves that the bit-extraction pattern +// is correct. +// +// the third step is a separate run of test-backend-ops on vulkan. once the +// shaders are compiled to spir-v and run on an actual gpu, test-backend-ops +// compares vulkan-backed tensor ops (get_rows, mul_mat, cpy, set_rows) against +// the cpu backend. that's the only step that proves the glsl itself is correct +// end-to-end, since steps one and two only validate the c++ stand-in. +// +// steps one and two catch nearly all transcription errors before the slow gpu +// build. step three covers the rest (glsl -> spir-v compiler quirks, driver +// issues, memory layout, etc) and is required to declare the shader correct. +// +// files this simulator stands in for: +// ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl (matvec) +// ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_0.comp (standalone dequant) +// ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl (cooperative matrix 2) +// ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl (matmul) +// ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp (f32 -> q2_0) + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include + +// block format (matches ggml-common.h:187-192 and types.glsl:207-214) + +static const int QK2_0 = 128; + +struct block_q2_0 { + uint16_t d; // fp16 raw bits + uint8_t qs[QK2_0/4]; // 32 bytes +}; +static_assert(sizeof(block_q2_0) == 2 + 32, "block_q2_0 must be 34 bytes"); + +// fp16 <-> fp32 (ieee 754 half precision). +// +// we need a behavioural match with ggml's GGML_FP32_TO_FP16 / GGML_FP16_TO_FP32. +// the simplest deterministic conversion is via memcpy bit-punning between float +// and uint32_t. we use the standard bit-twiddling versions, with one important +// invariant ggml requires: a finite fp32 input that overflows the fp16 range +// must clamp to (signed) infinity, NOT become NaN. a NaN result is reserved +// for actual fp32 NaN inputs (raw exponent 0xff, non-zero mantissa) + +static float fp16_to_fp32(uint16_t h) { + const uint32_t s = (h & 0x8000u) << 16; + const uint32_t e = (h & 0x7c00u) >> 10; + const uint32_t m = (h & 0x03ffu); + uint32_t f; + if (e == 0) { + if (m == 0) { + f = s; // signed zero + } else { + // subnormal: normalize + uint32_t mm = m; + int shift = 0; + while ((mm & 0x0400u) == 0) { + mm <<= 1; + ++shift; + } + mm &= 0x03ffu; + const uint32_t ee = (uint32_t)(127 - 15 - shift + 1); + f = s | (ee << 23) | (mm << 13); + } + } else if (e == 31) { + f = s | 0x7f800000u | (m << 13); // inf or nan + } else { + const uint32_t ee = e + (127 - 15); + f = s | (ee << 23) | (m << 13); + } + float out; + std::memcpy(&out, &f, 4); + return out; +} + +static uint16_t fp32_to_fp16(float x) { + uint32_t f; + std::memcpy(&f, &x, 4); + const uint32_t s = (f >> 16) & 0x8000u; + const uint32_t raw_e = (f >> 23) & 0xffu; + uint32_t m = f & 0x7fffffu; + int32_t e = (int32_t)raw_e - 127 + 15; + + // fp32 nan or inf (raw exponent all-ones) + if (raw_e == 0xffu) { + if (m == 0) return (uint16_t)(s | 0x7c00u); // signed inf + const uint32_t hm = m >> 13; // top 10 bits of mantissa + return (uint16_t)(s | 0x7c00u | (hm ? hm : 0x200u)); // nan, preserve a payload + } + + // finite, but exponent overflows fp16. clamp to signed inf (matches ggml) + if (e >= 31) { return (uint16_t)(s | 0x7c00u); } + + // subnormal or underflow + if (e <= 0) { + if (e < -10) return (uint16_t)s; + m = (m | 0x800000u) >> (1 - e); + // round to nearest even + if (m & 0x1000u) m += 0x2000u; + return (uint16_t)(s | (m >> 13)); + } + + // normal range, with rounding (and possible carry into exponent) + if (m & 0x1000u) { + m += 0x2000u; + if (m & 0x800000u) { + m = 0; + e += 1; + if (e >= 31) return (uint16_t)(s | 0x7c00u); + } + } + return (uint16_t)(s | ((uint32_t)e << 10) | (m >> 13)); +} + +// cpu reference (mirror of ggml-quants.c) + +static void cpu_dequantize_row_q2_0(const block_q2_0 * x, float * y, int64_t k) { + assert(k % QK2_0 == 0); + const int nb = (int)(k / QK2_0); + for (int i = 0; i < nb; ++i) { + const float d = fp16_to_fp32(x[i].d); + for (int j = 0; j < QK2_0; ++j) { + const int byte_index = j / 4; + const int bit_offset = (j % 4) * 2; + const uint8_t q = (x[i].qs[byte_index] >> bit_offset) & 0x03; + y[i*QK2_0 + j] = ((int)q - 1) * d; + } + } +} + +static void cpu_quantize_row_q2_0(const float * x, block_q2_0 * y, int64_t k) { + assert(k % QK2_0 == 0); + const int nb = (int)(k / QK2_0); + for (int i = 0; i < nb; ++i) { + float amax = 0.0f; + for (int j = 0; j < QK2_0; ++j) { + const float a = std::fabs(x[i*QK2_0 + j]); + if (a > amax) amax = a; + } + const float d = amax; + const float id = d > 0.0f ? 1.0f/d : 0.0f; + y[i].d = fp32_to_fp16(d); + for (int j = 0; j < QK2_0/4; ++j) y[i].qs[j] = 0; + for (int j = 0; j < QK2_0; ++j) { + int q = (int)std::round(x[i*QK2_0 + j] * id) + 1; + if (q < 0) q = 0; + if (q > 3) q = 3; + y[i].qs[j/4] |= (uint8_t)(q << ((j%4)*2)); + } + } +} + +// shader simulators: literal text-level translations of the glsl. +// each sim_* function is preceded by a quote of the corresponding glsl. inspect +// side-by-side to confirm the translation is faithful: same operations, same +// operand types, same evaluation order. the c++ uses uint32_t for glsl `uint`, +// std::uint8_t for glsl `uint8_t`, and float for glsl `float`. bit operations +// (>>, &, |) and integer arithmetic on non-negative values are bit-identical +// between glsl and c++ + +// dequant_funcs.glsl +struct vec2f { float x, y; }; +struct vec4f { float x, y, z, w; }; + +// GLSL (dequant_funcs.glsl): +// #if defined(DATA_A_Q2_0) +// vec2 dequantize(uint ib, uint iqs, uint a_offset) { +// const uint byte_val = uint(data_a[a_offset + ib].qs[iqs / 4u]); +// const uint shift = (iqs % 4u) * 2u; +// return vec2( +// float(int((byte_val >> shift) & 3u) - 1), +// float(int((byte_val >> (shift + 2u)) & 3u) - 1)); +// } +// #endif +static vec2f sim_dequant_funcs_dequantize(const block_q2_0 * data_a, uint32_t a_offset, + uint32_t ib, uint32_t iqs) { + const uint32_t byte_val = (uint32_t)data_a[a_offset + ib].qs[iqs / 4u]; + const uint32_t shift = (iqs % 4u) * 2u; + return { + (float)((int)((byte_val >> shift) & 3u) - 1), + (float)((int)((byte_val >> (shift + 2u)) & 3u) - 1) + }; +} + +// GLSL (dequant_funcs.glsl): +// vec4 dequantize4(uint ib, uint iqs, uint a_offset) { +// const uint byte_val = uint(data_a[a_offset + ib].qs[iqs / 4u]); +// const uint shift = (iqs % 4u) * 2u; +// return vec4( +// float(int((byte_val >> shift) & 3u) - 1), +// float(int((byte_val >> (shift + 2u)) & 3u) - 1), +// float(int((byte_val >> (shift + 4u)) & 3u) - 1), +// float(int((byte_val >> (shift + 6u)) & 3u) - 1)); +// } +static vec4f sim_dequant_funcs_dequantize4(const block_q2_0 * data_a, uint32_t a_offset, + uint32_t ib, uint32_t iqs) { + const uint32_t byte_val = (uint32_t)data_a[a_offset + ib].qs[iqs / 4u]; + const uint32_t shift = (iqs % 4u) * 2u; + return { + (float)((int)((byte_val >> shift) & 3u) - 1), + (float)((int)((byte_val >> (shift + 2u)) & 3u) - 1), + (float)((int)((byte_val >> (shift + 4u)) & 3u) - 1), + (float)((int)((byte_val >> (shift + 6u)) & 3u) - 1) + }; +} + +// GLSL (dequant_funcs.glsl): +// #if defined(DATA_A_Q2_0) +// vec2 get_dm(uint ib, uint a_offset) { +// return vec2(float(data_a[a_offset + ib].d), 0); +// } +// #endif +static vec2f sim_dequant_funcs_get_dm(const block_q2_0 * data_a, uint32_t a_offset, uint32_t ib) { + return { fp16_to_fp32(data_a[a_offset + ib].d), 0.0f }; +} + +// GLSL (dequant_funcs_cm2.glsl): +// layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ2_0 { +// block_q2_0 block; +// }; +// float16_t dequantFuncQ2_0(const in decodeBufQ2_0 bl, +// const in uint blockCoords[2], +// const in uint coordInBlock[2]) +// { +// const float16_t d = bl.block.d; +// const uint idx = coordInBlock[1]; +// const uint byte_val = uint(bl.block.qs[idx >> 2]); +// const uint shift = (idx & 3u) * 2u; +// return float16_t(int((byte_val >> shift) & 3u) - 1) * d; +// } +// +// note. glsl uses fp16 here (float16_t), the simulator uses fp32 because the +// test compares against the fp32 cpu reference. the fp16 variant rounds the +// per-element multiplication. we test that separately in test_cm2_fp16 below +static float sim_cm2_dequantFuncQ2_0(const block_q2_0 * bl, uint32_t coordInBlock_1) { + const float d = fp16_to_fp32(bl->d); + const uint32_t idx = coordInBlock_1; + const uint32_t byte_val = (uint32_t)bl->qs[idx >> 2]; + const uint32_t shift = (idx & 3u) * 2u; + return (float)((int)((byte_val >> shift) & 3u) - 1) * d; +} + +// GLSL (mul_mm_funcs.glsl, Q2_0 branch): +// #elif defined(DATA_A_Q2_0) +// const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; +// const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; +// +// const uint ib = idx / 32; +// const uint iqs = idx & 0x1fu; +// +// const float d = float(data_a[ib].d); +// const uint byte_val = uint(data_a[ib].qs[iqs]); +// +// buf_a[buf_idx ] = FLOAT_TYPEV2( +// float(int( byte_val & 3u) - 1) * d, +// float(int((byte_val >> 2u) & 3u) - 1) * d); +// buf_a[buf_idx + 1] = FLOAT_TYPEV2( +// float(int((byte_val >> 4u) & 3u) - 1) * d, +// float(int((byte_val >> 6u) & 3u) - 1) * d); +// +// the simulator returns the 4 floats this thread writes (the 2 vec2 pairs +// flattened). we omit the `pos_a + col * stride_a + row` index arithmetic +// because that's the matmul's address computation. the data-decoding part, +// which is what we're verifying, is just `idx` +struct mul_mm_load_result { float v[4]; }; + +static mul_mm_load_result sim_mul_mm_load_a_Q2_0(const block_q2_0 * data_a, uint32_t idx) { + const uint32_t ib = idx / 32; + const uint32_t iqs = idx & 0x1fu; + const float d = fp16_to_fp32(data_a[ib].d); + const uint32_t bv = (uint32_t)data_a[ib].qs[iqs]; + + mul_mm_load_result r; + r.v[0] = (float)((int)( bv & 3u) - 1) * d; + r.v[1] = (float)((int)((bv >> 2u) & 3u) - 1) * d; + r.v[2] = (float)((int)((bv >> 4u) & 3u) - 1) * d; + r.v[3] = (float)((int)((bv >> 6u) & 3u) - 1) * d; + return r; +} + +// GLSL (copy_to_quant.comp, Q2_0 branch): +// #if defined(DATA_A_Q2_0) +// void quantize(uint dst_idx, uint src_idx) +// { +// float amax = 0.0; +// [[unroll]] for (int j = 0; j < QUANT_K_Q2_0; ++j) { +// amax = max(amax, abs(data_s[src_idx + j])); +// } +// const float d = amax; +// const float id = (d > 0.0) ? 1.0/d : 0.0; +// data_q[dst_idx].d = float16_t(d); +// [[unroll]] for (int j = 0; j < QUANT_K_Q2_0 / 4; ++j) { +// data_q[dst_idx].qs[j] = uint8_t(0); +// } +// [[unroll]] for (int j = 0; j < QUANT_K_Q2_0; ++j) { +// int q = int(round(data_s[src_idx + j] * id)) + 1; +// q = clamp(q, 0, 3); +// data_q[dst_idx].qs[j / 4] |= uint8_t(q << ((j % 4) * 2)); +// } +// } +// #endif +static void sim_copy_to_quant_Q2_0(const float * data_s, block_q2_0 * data_q, + uint32_t dst_idx, uint32_t src_idx) { + float amax = 0.0f; + for (int j = 0; j < QK2_0; ++j) { + amax = std::fmax(amax, std::fabs(data_s[src_idx + j])); + } + const float d = amax; + const float id = (d > 0.0f) ? 1.0f/d : 0.0f; + data_q[dst_idx].d = fp32_to_fp16(d); + + for (int j = 0; j < QK2_0/4; ++j) data_q[dst_idx].qs[j] = 0; + + for (int j = 0; j < QK2_0; ++j) { + int q = (int)std::round(data_s[src_idx + j] * id) + 1; + // GLSL clamp(int, 0, 3) == max(0, min(3, x)) + if (q < 0) q = 0; + if (q > 3) q = 3; + data_q[dst_idx].qs[j/4] |= (uint8_t)(q << ((j%4)*2)); + } +} + +// GLSL (dequant_q2_0.comp): +// #version 450 +// #include "dequant_head.glsl" +// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; +// layout (binding = 0) readonly buffer A {block_q2_0 data_a[];}; +// layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; +// void main() { +// const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; +// const uint tid = gl_LocalInvocationID.x % 64; +// const uint il = tid / 4; +// const uint ir = tid % 4; +// const uint ib = 4*i + ir; +// if (ib >= p.nel / 128) return; +// const uint b_idx = 512*i + 128*ir + 8*il; +// const float d = float(data_a[ib].d); +// const uint b0 = uint(data_a[ib].qs[il*2 ]); +// const uint b1 = uint(data_a[ib].qs[il*2 + 1]); +// [[unroll]] for (uint l = 0; l < 4; ++l) { +// data_b[b_idx + l ] = D_TYPE(float(int((b0 >> (l*2u)) & 3u) - 1) * d); +// data_b[b_idx + l + 4] = D_TYPE(float(int((b1 >> (l*2u)) & 3u) - 1) * d); +// } +// } +// +// the simulator iterates every (workgroup, local_id) pair, executing the +// thread body. for each output element we record which thread wrote it and +// what value was written, then assert that every output index is covered +// exactly once and that the value matches the cpu reference +static void sim_dequant_q2_0_run(const block_q2_0 * data_a, uint32_t num_blocks, + std::vector & out, std::vector & writers) { + const uint32_t nel = num_blocks * QK2_0; + out.assign(nel, std::nanf("")); + writers.assign(nel, -1); + + // `p.nel / 128` from the shader + const uint32_t p_nel_div_128 = num_blocks; + + // 256 threads per workgroup, 16 blocks per workgroup, ceil(num_blocks/16) wgs + const uint32_t num_wg = (num_blocks + 15) / 16; + + int writer_id = 0; + for (uint32_t wg = 0; wg < num_wg; ++wg) { + for (uint32_t lid = 0; lid < 256; ++lid, ++writer_id) { + const uint32_t i = wg * 4u + lid / 64u; + const uint32_t tid = lid % 64u; + const uint32_t il = tid / 4u; // 0..15 + const uint32_t ir = tid % 4u; // 0..3 + const uint32_t ib = 4u*i + ir; + if (ib >= p_nel_div_128) continue; + + const uint32_t b_idx = 512u*i + 128u*ir + 8u*il; + + const float d = fp16_to_fp32(data_a[ib].d); + const uint32_t b0 = (uint32_t)data_a[ib].qs[il*2u ]; + const uint32_t b1 = (uint32_t)data_a[ib].qs[il*2u + 1u]; + + for (uint32_t l = 0; l < 4; ++l) { + const uint32_t i0 = b_idx + l; + const uint32_t i1 = b_idx + l + 4; + if (writers[i0] != -1 || writers[i1] != -1) { + std::fprintf(stderr, + "FAIL: dequant thread map overlap at i0=%u (prev writer %d, now %d)\n", + i0, writers[i0], writer_id); + std::abort(); + } + writers[i0] = writer_id; + writers[i1] = writer_id; + out[i0] = (float)((int)((b0 >> (l*2u)) & 3u) - 1) * d; + out[i1] = (float)((int)((b1 >> (l*2u)) & 3u) - 1) * d; + } + } + } +} + +// test helpers + +static int g_pass = 0; +static int g_fail = 0; + +static void check_eq_f32(const char * what, float a, float b, float tol = 0.0f) { + const float diff = std::fabs(a - b); + if (!(diff <= tol)) { + std::fprintf(stderr, "FAIL %s: %g vs %g (diff %g)\n", what, a, b, diff); + ++g_fail; + } else { + ++g_pass; + } +} + +static void check_eq_u8(const char * what, uint8_t a, uint8_t b) { + if (a != b) { + std::fprintf(stderr, "FAIL %s: 0x%02x vs 0x%02x\n", what, a, b); + ++g_fail; + } else { + ++g_pass; + } +} + +// build a random block: all 256 valid byte values x random scale +static block_q2_0 random_block(std::mt19937 & rng, float scale = 1.0f) { + block_q2_0 b; + std::uniform_real_distribution ds(0.001f, 4.0f); + b.d = fp32_to_fp16(scale > 0 ? scale : ds(rng)); + std::uniform_int_distribution bs(0, 255); + for (int j = 0; j < QK2_0/4; ++j) b.qs[j] = (uint8_t)bs(rng); + return b; +} + +// test 1: sim_dequant_funcs_dequantize matches cpu reference for every iqs + +static void test_dequantize() { + std::mt19937 rng(0xD2D20A55u); + const int num_blocks = 256; + std::vector blocks(num_blocks); + for (auto & b : blocks) b = random_block(rng); + + std::vector ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(blocks.data(), ref.data(), num_blocks * QK2_0); + + // dequantize() pre: iqs % 2 == 0 + for (int ib = 0; ib < num_blocks; ++ib) { + for (uint32_t iqs = 0; iqs < QK2_0; iqs += 2) { + const vec2f sim = sim_dequant_funcs_dequantize(blocks.data(), 0, ib, iqs); + const float d = fp16_to_fp32(blocks[ib].d); + check_eq_f32("dequantize.x", sim.x * d, ref[ib*QK2_0 + iqs]); + check_eq_f32("dequantize.y", sim.y * d, ref[ib*QK2_0 + iqs + 1]); + } + } +} + +// test 2: sim_dequant_funcs_dequantize4 matches cpu reference for every aligned iqs + +static void test_dequantize4() { + std::mt19937 rng(0xC4C40A55u); + const int num_blocks = 256; + std::vector blocks(num_blocks); + for (auto & b : blocks) b = random_block(rng); + + std::vector ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(blocks.data(), ref.data(), num_blocks * QK2_0); + + // dequantize4() pre: iqs % 4 == 0 + for (int ib = 0; ib < num_blocks; ++ib) { + for (uint32_t iqs = 0; iqs < QK2_0; iqs += 4) { + const vec4f sim = sim_dequant_funcs_dequantize4(blocks.data(), 0, ib, iqs); + const float d = fp16_to_fp32(blocks[ib].d); + check_eq_f32("dequantize4.x", sim.x * d, ref[ib*QK2_0 + iqs ]); + check_eq_f32("dequantize4.y", sim.y * d, ref[ib*QK2_0 + iqs + 1]); + check_eq_f32("dequantize4.z", sim.z * d, ref[ib*QK2_0 + iqs + 2]); + check_eq_f32("dequantize4.w", sim.w * d, ref[ib*QK2_0 + iqs + 3]); + } + } +} + +// test 3: sim_cm2_dequantFuncQ2_0 matches cpu reference for every idx + +static void test_cm2() { + std::mt19937 rng(0xCA5EBADAu); + const int num_blocks = 256; + std::vector blocks(num_blocks); + for (auto & b : blocks) b = random_block(rng); + + std::vector ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(blocks.data(), ref.data(), num_blocks * QK2_0); + + for (int ib = 0; ib < num_blocks; ++ib) { + for (uint32_t idx = 0; idx < QK2_0; ++idx) { + const float sim = sim_cm2_dequantFuncQ2_0(&blocks[ib], idx); + check_eq_f32("cm2.dequantFuncQ2_0", sim, ref[ib*QK2_0 + idx]); + } + } +} + +// test 4: sim_mul_mm_load_a_Q2_0 produces values matching cpu reference. each +// idx loads 4 consecutive values starting at logical position 4*iqs + +static void test_mul_mm() { + std::mt19937 rng(0xB1A50A55u); + const int num_blocks = 64; + std::vector blocks(num_blocks); + for (auto & b : blocks) b = random_block(rng); + + std::vector ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(blocks.data(), ref.data(), num_blocks * QK2_0); + + // each block has 32 idx values (LOAD_VEC_A=4 means 4 codes per idx, so 32 idx per block) + for (int ib = 0; ib < num_blocks; ++ib) { + for (uint32_t iqs = 0; iqs < 32; ++iqs) { + const uint32_t idx = ib * 32 + iqs; + const mul_mm_load_result r = sim_mul_mm_load_a_Q2_0(blocks.data(), idx); + // logical positions in the block + const uint32_t base = ib * QK2_0 + 4*iqs; + check_eq_f32("mul_mm[0]", r.v[0], ref[base + 0]); + check_eq_f32("mul_mm[1]", r.v[1], ref[base + 1]); + check_eq_f32("mul_mm[2]", r.v[2], ref[base + 2]); + check_eq_f32("mul_mm[3]", r.v[3], ref[base + 3]); + } + } +} + +// test 5: sim_copy_to_quant_Q2_0 matches cpu_quantize_row_q2_0 byte-exactly + +static void test_quantize() { + std::mt19937 rng(0xDEADC0DEu); + std::uniform_real_distribution dist(-2.0f, 2.0f); + const int num_blocks = 256; + std::vector input(num_blocks * QK2_0); + for (auto & v : input) v = dist(rng); + + // inject some all-zero blocks and constant-value blocks to exercise edge cases + std::fill(input.begin() + 0*QK2_0, input.begin() + 1*QK2_0, 0.0f); // all zeros + std::fill(input.begin() + 1*QK2_0, input.begin() + 2*QK2_0, 1.5f); // all positive constant + std::fill(input.begin() + 2*QK2_0, input.begin() + 3*QK2_0, -2.5f); // all negative constant + for (int j = 0; j < QK2_0; ++j) input[3*QK2_0 + j] = (j%2 ? 1.0f : -1.0f); // alternating + + std::vector ref(num_blocks); + std::vector sim(num_blocks); + cpu_quantize_row_q2_0(input.data(), ref.data(), num_blocks * QK2_0); + for (int i = 0; i < num_blocks; ++i) { + sim_copy_to_quant_Q2_0(input.data(), sim.data(), (uint32_t)i, (uint32_t)(i*QK2_0)); + } + + for (int i = 0; i < num_blocks; ++i) { + if (sim[i].d != ref[i].d) { + std::fprintf(stderr, "FAIL quantize[%d].d: 0x%04x vs 0x%04x\n", i, sim[i].d, ref[i].d); + ++g_fail; + } else { + ++g_pass; + } + for (int j = 0; j < QK2_0/4; ++j) { + char buf[64]; + std::snprintf(buf, sizeof(buf), "quantize[%d].qs[%d]", i, j); + check_eq_u8(buf, sim[i].qs[j], ref[i].qs[j]); + } + } +} + +// test 6: round-trip equivalence. sim_quantize then sim_dequantize matches +// cpu_quantize then cpu_dequantize. this is implied by tests 1+5 but we add it +// as a direct end-to-end check + +static void test_roundtrip() { + std::mt19937 rng(0xF00DBABEu); + std::uniform_real_distribution dist(-3.0f, 3.0f); + const int num_blocks = 64; + std::vector input(num_blocks * QK2_0); + for (auto & v : input) v = dist(rng); + + std::vector q_sim(num_blocks); + std::vector q_ref(num_blocks); + for (int i = 0; i < num_blocks; ++i) { + sim_copy_to_quant_Q2_0(input.data(), q_sim.data(), (uint32_t)i, (uint32_t)(i*QK2_0)); + } + cpu_quantize_row_q2_0(input.data(), q_ref.data(), num_blocks * QK2_0); + + std::vector y_sim(num_blocks * QK2_0); + std::vector y_ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(q_sim.data(), y_sim.data(), num_blocks * QK2_0); + cpu_dequantize_row_q2_0(q_ref.data(), y_ref.data(), num_blocks * QK2_0); + + for (size_t i = 0; i < y_sim.size(); ++i) { + check_eq_f32("roundtrip", y_sim[i], y_ref[i]); + } +} + +// test 7: dequant_q2_0.comp thread-map covering and value correctness + +static void test_dequant_q2_0_shader() { + std::mt19937 rng(0x5EED0A55u); + // 33 blocks: tests behaviour at ib boundary that crosses workgroup edge + const int num_blocks = 33; + std::vector blocks(num_blocks); + for (auto & b : blocks) b = random_block(rng); + + std::vector ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(blocks.data(), ref.data(), num_blocks * QK2_0); + + std::vector sim; + std::vector writers; + sim_dequant_q2_0_run(blocks.data(), (uint32_t)num_blocks, sim, writers); + + // coverage: every output index has exactly one writer + for (size_t i = 0; i < sim.size(); ++i) { + if (writers[i] == -1) { + std::fprintf(stderr, "FAIL dequant_q2_0 coverage: index %zu unwritten\n", i); + ++g_fail; + } else { + ++g_pass; + } + } + // value match: simulator output equals cpu reference + for (size_t i = 0; i < sim.size(); ++i) { + check_eq_f32("dequant_q2_0 value", sim[i], ref[i]); + } +} + +// test 8: matvec partial sum correctness +// simulate the inner loop of mul_mat_vec.comp for K_PER_ITER=8, QUANT_R=1: for +// each col stride 8, fetch dequantize4(iqs) and dequantize4(iqs+4), dot with +// the corresponding 8-element b slice, multiply by d, and accumulate. compare +// against a direct dot product using the cpu dequantized values + +static void test_matvec_partial() { + std::mt19937 rng(0xCAFEFACEu); + const int num_blocks = 16; + std::vector A(num_blocks); + for (auto & b : A) b = random_block(rng); + + std::uniform_real_distribution bdist(-1.0f, 1.0f); + std::vector B(num_blocks * QK2_0); + for (auto & v : B) v = bdist(rng); + + std::vector A_ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(A.data(), A_ref.data(), num_blocks * QK2_0); + + // direct dot product (reference) + double ref_dot = 0; + for (int j = 0; j < num_blocks * QK2_0; ++j) ref_dot += (double)A_ref[j] * (double)B[j]; + + // simulated matvec inner loop + double sim_dot = 0; + for (int ib = 0; ib < num_blocks; ++ib) { + const float d = fp16_to_fp32(A[ib].d); + for (uint32_t iqs = 0; iqs < QK2_0; iqs += 8) { + const vec4f v0 = sim_dequant_funcs_dequantize4(A.data(), 0, ib, iqs); + const vec4f v1 = sim_dequant_funcs_dequantize4(A.data(), 0, ib, iqs + 4); + // dot with b + float r = v0.x*B[ib*QK2_0 + iqs ] + v0.y*B[ib*QK2_0 + iqs + 1] + + v0.z*B[ib*QK2_0 + iqs + 2] + v0.w*B[ib*QK2_0 + iqs + 3]; + r += v1.x*B[ib*QK2_0 + iqs + 4] + v1.y*B[ib*QK2_0 + iqs + 5] + + v1.z*B[ib*QK2_0 + iqs + 6] + v1.w*B[ib*QK2_0 + iqs + 7]; + r *= d; + sim_dot += r; + } + } + + // fp32 accumulation with the same op order: agreement to within ~1e-4 of magnitude + const double mag = std::fabs(ref_dot) + 1e-6; + const double rel = std::fabs(sim_dot - ref_dot) / mag; + if (rel > 1e-4) { + std::fprintf(stderr, "FAIL matvec partial: ref=%.10g sim=%.10g rel=%.3e\n", + ref_dot, sim_dot, rel); + ++g_fail; + } else { + ++g_pass; + } +} + +// test 9: covering proof for all block counts mod 16 (boundary cases). +// +// the shader's `if (ib >= p.nel/128) return;` correctness depends on the block +// count not being a multiple of 16. we run num_blocks in {1, 15, 16, 17, 31, +// 32, 33, 48, 65} + +static void test_dequant_q2_0_boundary() { + for (int num_blocks : {1, 15, 16, 17, 31, 32, 33, 48, 65}) { + std::mt19937 rng((uint32_t)(num_blocks * 17 + 0xBEEF)); + std::vector blocks(num_blocks); + for (auto & b : blocks) b = random_block(rng); + + std::vector ref(num_blocks * QK2_0); + cpu_dequantize_row_q2_0(blocks.data(), ref.data(), num_blocks * QK2_0); + + std::vector sim; + std::vector writers; + sim_dequant_q2_0_run(blocks.data(), (uint32_t)num_blocks, sim, writers); + + for (size_t i = 0; i < sim.size(); ++i) { + if (writers[i] == -1) { + std::fprintf(stderr, + "FAIL boundary num_blocks=%d: idx %zu unwritten\n", + num_blocks, i); + ++g_fail; + return; + } + if (std::fabs(sim[i] - ref[i]) > 0.0f) { + std::fprintf(stderr, + "FAIL boundary num_blocks=%d: idx %zu sim=%g ref=%g\n", + num_blocks, i, sim[i], ref[i]); + ++g_fail; + return; + } + } + ++g_pass; + } +} + +// test 10: enumerate every byte value x every iqs alignment. +// +// brute-force assert that for every byte value 0..255 and every legal iqs, the +// simulators (dequantize, dequantize4, cm2, mul_mm) produce the codes expected +// by the lsb-first packing convention + +static void test_exhaustive_byte_decoding() { + block_q2_0 blk{}; + blk.d = fp32_to_fp16(2.0f); + const float d = fp16_to_fp32(blk.d); + + for (int byte = 0; byte < 256; ++byte) { + // place this byte at every possible position within the block + for (int slot = 0; slot < QK2_0/4; ++slot) { + std::memset(blk.qs, 0, sizeof(blk.qs)); + blk.qs[slot] = (uint8_t)byte; + + // expected codes from this byte + const int q0 = byte & 3; + const int q1 = (byte >> 2) & 3; + const int q2 = (byte >> 4) & 3; + const int q3 = (byte >> 6) & 3; + + const float exp0 = (float)(q0 - 1) * d; + const float exp1 = (float)(q1 - 1) * d; + const float exp2 = (float)(q2 - 1) * d; + const float exp3 = (float)(q3 - 1) * d; + + // dequantize() at iqs = 4*slot and 4*slot+2 + const uint32_t iqs0 = 4u*slot; + const vec2f r0 = sim_dequant_funcs_dequantize(&blk, 0, 0, iqs0); + check_eq_f32("exhaustive dequantize.x[0]", r0.x * d, exp0); + check_eq_f32("exhaustive dequantize.y[0]", r0.y * d, exp1); + const vec2f r1 = sim_dequant_funcs_dequantize(&blk, 0, 0, iqs0 + 2); + check_eq_f32("exhaustive dequantize.x[1]", r1.x * d, exp2); + check_eq_f32("exhaustive dequantize.y[1]", r1.y * d, exp3); + + // dequantize4() at iqs = 4*slot + const vec4f r4 = sim_dequant_funcs_dequantize4(&blk, 0, 0, iqs0); + check_eq_f32("exhaustive dequantize4.x", r4.x * d, exp0); + check_eq_f32("exhaustive dequantize4.y", r4.y * d, exp1); + check_eq_f32("exhaustive dequantize4.z", r4.z * d, exp2); + check_eq_f32("exhaustive dequantize4.w", r4.w * d, exp3); + + // cm2: for each of the 4 values within this byte + for (int k = 0; k < 4; ++k) { + const float exp = (float)(((byte >> (k*2)) & 3) - 1) * d; + const float got = sim_cm2_dequantFuncQ2_0(&blk, (uint32_t)(4*slot + k)); + check_eq_f32("exhaustive cm2", got, exp); + } + + // mul_mm load: idx = ib*32 + iqs (here ib=0, iqs=slot) + const mul_mm_load_result mm = sim_mul_mm_load_a_Q2_0(&blk, (uint32_t)slot); + check_eq_f32("exhaustive mul_mm[0]", mm.v[0], exp0); + check_eq_f32("exhaustive mul_mm[1]", mm.v[1], exp1); + check_eq_f32("exhaustive mul_mm[2]", mm.v[2], exp2); + check_eq_f32("exhaustive mul_mm[3]", mm.v[3], exp3); + } + } +} + +// test 11: overflow and edge-case stress. +// +// the goal is to show that the simulator (and therefore the glsl by +// inspection-equivalence) never produces an out-of-range integer or unbounded +// fp value for any representable input. +// +// the first sub-check walks every byte in [0,256) at fp16 scale d=1 and +// confirms the unscaled codes (q-1) always land in {-1, 0, 1, 2}. +// +// the second sub-check quantizes a block whose magnitudes saturate fp16. the +// codes still land in {0,1,2,3} after clamp regardless of input magnitude, +// matching the cpu reference. +// +// the third sub-check pins down the cm2 fp16 multiply at the format edge. +// q=3 gives 2*d which can overflow fp16 once d exceeds 32768, but q2_0 has +// the smallest multiplier of any cm2 quant path so it overflows last. +// +// the fourth sub-check verifies that `b_idx + l + 4` in dequant_q2_0.comp +// stays within uint32 range for any block count up to 16 million (a tensor +// over 2 gb), so realistic models have plenty of headroom + +static void test_overflow_codes_in_range() { + // every byte, every iqs, every block. q-1 must stay in {-1,0,1,2} + for (int byte = 0; byte < 256; ++byte) { + block_q2_0 b{}; + b.d = fp32_to_fp16(1.0f); + for (int slot = 0; slot < QK2_0/4; ++slot) { + std::memset(b.qs, 0, sizeof(b.qs)); + b.qs[slot] = (uint8_t)byte; + for (uint32_t idx = 0; idx < QK2_0; ++idx) { + const float v = sim_cm2_dequantFuncQ2_0(&b, idx); + // unscaled value (d == 1.0) must be in {-1,0,1,2} + if (!(v == -1.0f || v == 0.0f || v == 1.0f || v == 2.0f)) { + std::fprintf(stderr, + "FAIL overflow.codes byte=0x%02x slot=%d idx=%u v=%g\n", + byte, slot, idx, v); + ++g_fail; + return; + } + } + } + } + ++g_pass; +} + +static void test_overflow_quantize_huge_input() { + // input magnitudes 1e30 (above fp16 max around 65504). quantize must + // produce a block whose codes are still in {0..3}, and dequant must produce + // a finite or saturated but defined value for every entry + std::vector input(QK2_0); + for (int j = 0; j < QK2_0; ++j) input[j] = (j%2 ? 1.0f : -1.0f) * 1e30f; + + block_q2_0 sim{}, ref{}; + sim_copy_to_quant_Q2_0(input.data(), &sim, 0, 0); + cpu_quantize_row_q2_0(input.data(), &ref, QK2_0); + + // codes must be in {0..3} + for (int j = 0; j < QK2_0/4; ++j) { + for (int sub = 0; sub < 4; ++sub) { + const int q_sim = (sim.qs[j] >> (sub*2)) & 3; + const int q_ref = (ref.qs[j] >> (sub*2)) & 3; + if (q_sim < 0 || q_sim > 3) { + std::fprintf(stderr, "FAIL overflow.code_range sim qs[%d].%d=%d\n", j, sub, q_sim); + ++g_fail; + return; + } + if (q_sim != q_ref) { + std::fprintf(stderr, "FAIL overflow.code_match sim=%d ref=%d at qs[%d].%d\n", + q_sim, q_ref, j, sub); + ++g_fail; + return; + } + } + } + if (sim.d != ref.d) { + std::fprintf(stderr, "FAIL overflow.d sim=0x%04x ref=0x%04x\n", sim.d, ref.d); + ++g_fail; + return; + } + ++g_pass; +} + +// for those who have made it this far into the file, heres an easter egg + +// as we navigate this era of technological revolution and intelligence, +// it is imperative that we build systems that are robust and reliable, +// especially when we do not understand the full implications of our +// own design choices. for those that read this comment, its likely that +// you sit at the forefront of ai and hold the knowledge necessary +// to transform this world. it is part of your moral obligation to +// use that knowledge to transform this world in a positive manner. +// may god bless us all, and i pray we can build a better future +// together. + +static void test_overflow_cm2_fp16_multiply() { + // for the cm2 path in fp16, (q-1)*d can theoretically exceed fp16 max + // (around 65504) when d > 32768 and q == 3 (giving 2d). q2_0 has the + // smallest multiplier of any cm2 quant path (max 2x, vs q4_0's 8x and + // q5_0's 16x), so it is the most robust of the cm2 paths against fp16 + // overflow. + // + // we empirically check that with d at fp16 max and codes {0,1,2,3} the + // products are {-65504, 0, 65504, +/-inf}, that q==3 is the only code + // that overflows, and that smaller d values are safe + + // d = 65504 (fp16 max, still representable). code 3 gives 2*d = 131008 + // which is +/-inf in fp16. we check the simulator mirrors this + block_q2_0 b{}; + b.d = 0x7BFF; // fp16 max around 65504 + b.qs[0] = 0xE4; + // codes [0,1,2,3] in positions 0..3 (lsb-first 00 01 10 11) + // 0xE4 = 11100100b. bits 0..1=00 (q=0), 2..3=01 (q=1), + // 4..5=10 (q=2), 6..7=11 (q=3) + + const float d_f32 = fp16_to_fp32(b.d); + const float v0 = sim_cm2_dequantFuncQ2_0(&b, 0); // q=0 gives -d + const float v1 = sim_cm2_dequantFuncQ2_0(&b, 1); // q=1 gives 0 + const float v2 = sim_cm2_dequantFuncQ2_0(&b, 2); // q=2 gives +d + const float v3 = sim_cm2_dequantFuncQ2_0(&b, 3); // q=3 gives +2d (would overflow fp16) + + check_eq_f32("cm2.q0", v0, -d_f32); + check_eq_f32("cm2.q1", v1, 0.0f); + check_eq_f32("cm2.q2", v2, +d_f32); + // simulator works in fp32. the gpu's cm2 path works in fp16 and would + // saturate to +/-inf. both are deterministic and both are documented + check_eq_f32("cm2.q3", v3, 2.0f * d_f32); + + // sanity check, with d = 1.0 (normal case) there is no overflow at all + b.d = fp32_to_fp16(1.0f); + b.qs[0] = 0xE4; + check_eq_f32("cm2.q0.normal", sim_cm2_dequantFuncQ2_0(&b, 0), -1.0f); + check_eq_f32("cm2.q1.normal", sim_cm2_dequantFuncQ2_0(&b, 1), 0.0f); + check_eq_f32("cm2.q2.normal", sim_cm2_dequantFuncQ2_0(&b, 2), +1.0f); + check_eq_f32("cm2.q3.normal", sim_cm2_dequantFuncQ2_0(&b, 3), +2.0f); +} + +static void test_overflow_b_idx_uint32_headroom() { + // for a tensor of 16 million blocks (around 2 gb at 34 bytes per block), + // the maximum b_idx in dequant_q2_0.comp is well within uint32 range + const uint64_t num_blocks = 16ull * 1024 * 1024; // 16 m blocks + const uint64_t num_wg = (num_blocks + 15) / 16; + const uint64_t max_i = (num_wg - 1) * 4 + 3; + const uint64_t max_ir = 3; + const uint64_t max_il = 15; + const uint64_t max_b_idx = 512 * max_i + 128 * max_ir + 8 * max_il + 7; // +7 for last l + if (max_b_idx > 0xFFFFFFFFull) { + std::fprintf(stderr, "FAIL b_idx headroom: %llu blocks would overflow uint32\n", + (unsigned long long)num_blocks); + ++g_fail; + return; + } + ++g_pass; +} + + +int main() { + std::printf("== test-vulkan-q2_0-shader-sim ==\n"); + test_dequantize(); + test_dequantize4(); + test_cm2(); + test_mul_mm(); + test_quantize(); + test_roundtrip(); + test_dequant_q2_0_shader(); + test_dequant_q2_0_boundary(); + test_matvec_partial(); + test_exhaustive_byte_decoding(); + test_overflow_codes_in_range(); + test_overflow_quantize_huge_input(); + test_overflow_cm2_fp16_multiply(); + test_overflow_b_idx_uint32_headroom(); + std::printf("checks: %d passed, %d failed\n", g_pass, g_fail); + return g_fail == 0 ? 0 : 1; +} diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 840eefc2f5ac..15ef64c4b0ed 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -33,6 +33,7 @@ struct quant_option { static const std::vector QUANT_OPTIONS = { { "Q1_0", LLAMA_FTYPE_MOSTLY_Q1_0, " 1.125 bpw quantization", }, + { "Q2_0", LLAMA_FTYPE_MOSTLY_Q2_0, " 2.25 bpw quantization (group 64)", }, { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, { "MXFP4_MOE",LLAMA_FTYPE_MOSTLY_MXFP4_MOE," MXFP4 MoE", },