@@ -91,7 +91,11 @@ pub struct Tensor {
     data: *mut u8,

     // for quantization, only used if dtype == TensorDType::K4BitQuantization
-    // Contains 16 values per row in f16 (i.e. 32 bytes per row)
+    // q4_data is (NxM) where N is:
+    //
+    // ((cols + 511) / 512) * 32 (i.e. 32 bytes for every 512 columns, rounded up)
+    //
+    // and M is number of rows in the tensor.
     q4_data: *mut u8,

     #[cfg(feature = "opencl")]
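To make the layout comment above concrete: each row owns one 32-byte lookup table (16 f16 values) per 512-column block, and a row's tables are laid out back to back. A minimal standalone sketch of that indexing; the helper name `q4_table_offset` is made up for illustration and is not part of the patch:

```rust
/// Byte offset (into q4_data) of the lookup table covering (row, col),
/// following the layout described in the struct comment. Illustrative only.
fn q4_table_offset(cols: i64, row: i64, col: i64) -> usize {
    let row_capacity = (((cols + 511) / 512) * 32) as usize; // table bytes per row
    let col_block = (col / 512) as usize; // which 512-column block `col` falls in
    row as usize * row_capacity + col_block * 32
}

fn main() {
    // 1024 columns -> 2 blocks -> 64 table bytes per row.
    assert_eq!(q4_table_offset(1024, 0, 700), 32); // second block of row 0
    assert_eq!(q4_table_offset(1024, 3, 100), 3 * 64); // first block of row 3
}
```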
@@ -345,11 +349,9 @@ impl Tensor {
                     BitSide::Lower => (addr_val & 0x0F),
                 }
             };
-            let table: I16x8 = if quant_val <= 7 {
-                unsafe { load_i16x8(self.q4_data.add(row as usize * 32) as *const I16x8) }
-            } else {
-                unsafe { load_i16x8(self.q4_data.add(row as usize * 32 + 16) as *const I16x8) }
-            };
+            let (table1, table2) = self.q4_lookup_table(row, col);
+            let table = if quant_val <= 7 { table1 } else { table2 };
             let table = i16x8_as_f16_to_f32x8(table);
             f32x8_get(table, (quant_val % 8) as usize)
         }
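For reference, the lookup above boils down to: a 4-bit code selects one of 16 per-(row, block) values, stored as two 8-entry halves, with codes 0..=7 hitting the first half and 8..=15 the second. A scalar sketch using plain f32 arrays in place of the f16/SIMD types used by the patch:

```rust
/// Scalar equivalent of the table lookup: `quant_val` is the 4-bit code,
/// `table1`/`table2` are the two halves returned by q4_lookup_table.
fn dequantize_4bit(quant_val: u8, table1: &[f32; 8], table2: &[f32; 8]) -> f32 {
    debug_assert!(quant_val <= 15);
    let half = if quant_val <= 7 { table1 } else { table2 };
    half[(quant_val % 8) as usize]
}

fn main() {
    let t1 = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7];
    let t2 = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7];
    assert_eq!(dequantize_4bit(3, &t1, &t2), 0.3);
    assert_eq!(dequantize_4bit(12, &t1, &t2), 1.4);
}
```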
@@ -1261,11 +1263,6 @@ impl Tensor {
                     continue;
                 }
-                let quant0 = load_i16x8(other_q4_data.add(col * 32) as *const I16x8);
-                let quant1 = load_i16x8(other_q4_data.add(col * 32 + 16) as *const I16x8);
-                let quants: [F32x8; 2] =
-                    [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];

     #[inline]
     fn load_f32(
         src: *const f32,
@@ -1360,6 +1357,10 @@ impl Tensor {
                     [f32x8_zero(), f32x8_zero(), f32x8_zero(), f32x8_zero()];

                 for p in 0..src_cols_its {
+                    let (quant0, quant1) = other.q4_lookup_table(col as i64, p as i64 * 32);
+                    let quants: [F32x8; 2] =
+                        [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];
+
                     let (other8_0, other8_1, other8_2, other8_3) =
                         load_k4_to_f32(&other, col, p * 32, other_rows, quants.as_ptr());
                     let src8_0 = load_f32(
@@ -1454,11 +1455,6 @@ impl Tensor {
         let tgt_data: *mut f32 = tgt_data.unwrap() as *mut f32;

         for row in 0..self_rows {
-            let quant0 = load_i16x8(src_q4_data.add(row * 32) as *const I16x8);
-            let quant1 = load_i16x8(src_q4_data.add(row * 32 + 16) as *const I16x8);
-            let quants: [F32x8; 2] =
-                [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];

             for col_raw in 0..self_cols_its {
                 let row_col = row * self_cols_its + col_raw;
                 if row_col % nthreads != thread_idx {
@@ -1567,6 +1563,10 @@ impl Tensor {
                 ];

                 for p in 0..src_cols_its {
+                    let (quant0, quant1) = src.q4_lookup_table(row as i64, p as i64 * 32);
+                    let quants: [F32x8; 2] =
+                        [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];
+
                     // Macro to make code shorter
                     macro_rules! lo {
                         ($col:expr, $p:expr) => {
@@ -1693,11 +1693,6 @@ impl Tensor {
         let other_data: *const u8 = other_data.unwrap() as *const u8;
         let tgt_data: *mut f32 = tgt_data.unwrap() as *mut f32;

-        let quant0 = load_i16x8(other_q4_data.add(0) as *const I16x8);
-        let quant1 = load_i16x8(other_q4_data.add(16) as *const I16x8);
-        let quants: [F32x8; 2] =
-            [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];

         let col = 0;
         for row in 0..self_rows {
             if row % nthreads != thread_idx {
@@ -1797,6 +1792,10 @@ impl Tensor {
                 [f32x8_zero(), f32x8_zero(), f32x8_zero(), f32x8_zero()];

             for p in 0..src_cols_its {
+                let (quant0, quant1) = other.q4_lookup_table(col as i64, p as i64 * 32);
+                let quants: [F32x8; 2] =
+                    [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];
+
                 let (other8_0, other8_1, other8_2, other8_3) =
                     load_k4_to_f32(&other, col, p * 32, other_rows, quants.as_ptr());
                 let src8_0 =
@@ -1884,10 +1883,6 @@ impl Tensor {
             if row % nthreads != thread_idx {
                 continue;
             }
-            let quant0 = load_i16x8(src_q4_data.add(row * 32) as *const I16x8);
-            let quant1 = load_i16x8(src_q4_data.add(row * 32 + 16) as *const I16x8);
-            let quants: [F32x8; 2] =
-                [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];

             let col = 0;
@@ -1985,6 +1980,9 @@ impl Tensor {
                 [f32x8_zero(), f32x8_zero(), f32x8_zero(), f32x8_zero()];

             for p in 0..src_cols_its {
+                let (quant0, quant1) = src.q4_lookup_table(row as i64, p as i64 * 32);
+                let quants: [F32x8; 2] =
+                    [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];
                 let other8_0: F32x8 = load_f32(
                     other_data,
                     col,
@@ -2816,9 +2814,12 @@ impl Tensor {
     /// values and quantization lookup table.
     ///
     /// The first function must always return numbers from 0 to 15. These are put in the matrix.
-    /// The second function must return, for each row, 16 floats that correspond the numbers
+    /// The second function must return, for each row and 512-long column block, 16 floats that correspond the numbers
     /// returns from the first function. For this quantization scheme, each row has its own set of
     /// floats.
+    ///
+    /// The second function is called for each (row, column) where column is 0, 512, 1024, etc. so
+    /// that it would cover the entire matrix.
     #[inline]
     pub fn make_k4bit_from_fn<F, F2>(
         rows: i64,
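A sketch of what a call site looks like under the updated contract; the closure bodies here are made up, the point is just the new shapes (the first closure gets (row, col) and returns a 4-bit code, the second gets (row, block_start_col) with block_start_col = 0, 512, 1024, ... and returns that block's 16 dequantization values):

```rust
let tensor = Tensor::make_k4bit_from_fn(
    4,    // rows
    1024, // cols -> two 512-column blocks, so two lookup tables per row
    |_row, col| (col % 16) as u8, // 4-bit code for each cell
    |row, block_start_col| {
        // per-(row, block) table; the values are arbitrary for this sketch
        let mut table = [0.0f32; 16];
        for (i, v) in table.iter_mut().enumerate() {
            *v = (row * 10_000 + block_start_col + i as i64) as f32;
        }
        table
    },
);
```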
@@ -2828,39 +2829,21 @@ impl Tensor {
     ) -> Self
     where
         F: FnMut(i64, i64) -> u8,
-        F2: FnMut(i64) -> [f32; 16],
+        F2: FnMut(i64, i64) -> [f32; 16],
     {
         let mut result =
             unsafe { Tensor::uninitialized(rows, cols, TensorDType::K4BitQuantization) };
         result.allocate_q4_data();
         assert!(!result.q4_data.is_null());

+        let col_blocks = (cols + 511) / 512;
+        let q4_col_capacity = col_blocks as usize * 32;
+
         unsafe {
             for row in 0..rows {
+                for block in 0..col_blocks {

-                // Set the lookup table to
-                let lookup_table = get_lookup_table(row);
-                /*
-                let table1 = f32x8_from_values(
-                    lookup_table[0],
-                    lookup_table[1],
-                    lookup_table[2],
-                    lookup_table[3],
-                    lookup_table[4],
-                    lookup_table[5],
-                    lookup_table[6],
-                    lookup_table[7],
-                );
-                let table2 = f32x8_from_values(
-                    lookup_table[8],
-                    lookup_table[9],
-                    lookup_table[10],
-                    lookup_table[11],
-                    lookup_table[12],
-                    lookup_table[13],
-                    lookup_table[14],
-                    lookup_table[15],
-                );
-                */
+                let lookup_table = get_lookup_table(row, block * 512);
                 let table1 = f32x8_from_values(
                     lookup_table[7],
                     lookup_table[6],
@@ -2884,13 +2867,24 @@ impl Tensor {
                 let table1 = f32x8_to_i16x8_as_f16(table1);
                 let table2 = f32x8_to_i16x8_as_f16(table2);
-                store_i16x8(result.q4_data.add(row as usize * 32) as *mut I16x8, table1);
                 store_i16x8(
-                    result.q4_data.add(row as usize * 32 + 16) as *mut I16x8,
+                    result
+                        .q4_data
+                        .add(row as usize * q4_col_capacity + block as usize * 32)
+                        as *mut I16x8,
+                    table1,
+                );
+                store_i16x8(
+                    result
+                        .q4_data
+                        .add(row as usize * q4_col_capacity + block as usize * 32 + 16)
+                        as *mut I16x8,
                     table2,
                 );

-                for col in 0..cols {
+                let start = block * 512;
+                let end = std::cmp::min(start + 512, cols);
+                for col in start..end {
                     let v = get_value(row, col);

                     let (addr, side) = result.q4_address(row, col);
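The write loop above hands each 4-bit code to q4_address/BitSide, which this diff only shows in passing (the Lower arm masks with 0x0F near the top). The sketch below just illustrates the general idea of packing two codes per byte; which column parity lands in the high versus the low nibble is an assumption, not something this diff establishes:

```rust
/// Illustrative nibble packing: two 4-bit codes per byte.
/// The even/odd-column-to-nibble mapping here is assumed, not taken from the patch.
fn write_code(packed: &mut [u8], col: usize, code: u8) {
    debug_assert!(code <= 15);
    let byte = col / 2;
    if col % 2 == 0 {
        packed[byte] = (packed[byte] & 0x0F) | (code << 4); // high nibble
    } else {
        packed[byte] = (packed[byte] & 0xF0) | (code & 0x0F); // low nibble
    }
}

fn main() {
    let mut row = vec![0u8; 8]; // 16 columns, 2 codes per byte
    for col in 0..16 {
        write_code(&mut row, col, col as u8);
    }
    assert_eq!(row[0], 0x01); // codes 0 and 1 share the first byte
}
```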
@@ -2907,6 +2901,7 @@ impl Tensor {
                 }
+                }
             }
         }
         result
     }
@@ -2940,6 +2935,26 @@ impl Tensor {
         }
     }

+    /// K4 bit quantization; loads the quantization table for given row/column
+    #[inline]
+    fn q4_lookup_table(&self, row: i64, col: i64) -> (I16x8, I16x8) {
+        let q4_capacity = (((self.cols + 511) / 512) * 32) as usize;
+        let col_block = col / 512;
+        unsafe {
+            let v1 = load_i16x8(
+                self.q4_data
+                    .add(row as usize * q4_capacity + col_block as usize * 32)
+                    as *const I16x8,
+            );
+            let v2 = load_i16x8(
+                self.q4_data
+                    .add(row as usize * q4_capacity + col_block as usize * 32 + 16)
+                    as *const I16x8,
+            );
+            (v1, v2)
+        }
+    }
+
     fn allocate_q4_data(&mut self) {
         if self.dtype != TensorDType::K4BitQuantization {
             panic!("Can only allocate q4 data for K4BitQuantization");
@@ -2949,7 +2964,10 @@ impl Tensor {
             return;
         }

-        let layout = Layout::from_size_align(self.rows as usize * 32, 32).unwrap();
+        let q4_cols_capacity = ((self.cols + 511) / 512) * 32;
+
+        let layout =
+            Layout::from_size_align(self.rows as usize * q4_cols_capacity as usize, 32).unwrap();
         let q4_data = unsafe { std::alloc::alloc_zeroed(layout) };
         if q4_data.is_null() {
             panic!("Failed to allocate q4 data");
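The allocation above grows accordingly: the old table area was a flat rows * 32 bytes, while the per-block layout needs rows * ((cols + 511) / 512) * 32. A quick check of that arithmetic (the helper name is hypothetical):

```rust
fn q4_alloc_size(rows: i64, cols: i64) -> usize {
    let per_row = ((cols + 511) / 512) * 32; // bytes of lookup tables per row
    (rows * per_row) as usize
}

fn main() {
    // <= 512 columns: one block per row, same footprint as the old layout.
    assert_eq!(q4_alloc_size(16, 16), 16 * 32);
    // 4096 x 4096: 8 blocks per row -> 256 table bytes per row -> 1 MiB total.
    assert_eq!(q4_alloc_size(4096, 4096), 1024 * 1024);
}
```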
@@ -4116,7 +4134,7 @@ mod tests {
             16,
             16,
             |_row, col| col as u8,
-            |row| {
+            |row, _col| {
                 let mut result: [f32; 16] = [0.0; 16];
                 for col in 0..16 {
                     result[col] = reference.get_f32(row, col as i64);
@@ -4174,7 +4192,7 @@ mod tests {
             a,
             b,
             |row, col| quantized_values[row as usize][col as usize],
-            |row| {
+            |row, _col| {
                 let mut result: [f32; 16] = [0.0; 16];
                 for col in 0..16 {
                     result[col] = quant_values[row as usize][col];
@@ -4249,7 +4267,7 @@ mod tests {
             b,
             c,
             |row, col| quantized_values[row as usize][col as usize],
-            |row| {
+            |row, _col| {
                 let mut result: [f32; 16] = [0.0; 16];
                 for col in 0..16 {
                     result[col] = quant_values[row as usize][col];
@@ -4325,7 +4343,7 @@ mod tests {
             b,
             c,
             |row, col| quantized_values[row as usize][col as usize],
-            |row| {
+            |row, _col| {
                 let mut result: [f32; 16] = [0.0; 16];
                 for col in 0..16 {
                     result[col] = quant_values[row as usize][col];