Multithread the k4 * f32 matrix multiplication.

3 years ago · 40121e1c82
parent b8946da2d8
commit 40121e1c82
2 changed files with 175 additions and 159 deletions
--- a/src/simd_support.rs
+++ b/src/simd_support.rs
@@ -314,9 +314,11 @@ pub fn horizontal_sum_and_f32_to_f16(mut ymm: __m256) -> f16 {

 /// Prints a binary representation of i16x8 to stdout in this form:
 ///
+/// ```ignore
 ///       0                 0                 0               0
 ///     0x0000           0x0000           0x0000           0x0000
 /// 0000000000000000 0000000000000000 0000000000000000 0000000000000000 etc.
+/// ```
 ///
 /// decimal on first line, hex on second, binary on third.
 pub fn print_i16x8(a: I16x8) {
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -1187,10 +1187,6 @@ impl Tensor {
            let self_cols: usize = self.cols as usize;
            let self_cols_capacity: usize = self.capacity_cols as usize;

-            let src_data: *const u8 = src.data;
-            let other_data: *const f32 = other.data as *const f32;
-            let tgt_data: *mut f32 = self.data as *mut f32;
-
            // src_cols_its == also the shared dimension between src and other.
            let src_cols_its = if src_cols % 32 == 0 {
                src_cols / 32
@@ -1199,13 +1195,30 @@ impl Tensor {
            };
            debug_assert!(!src.q4_data.is_null());

+            let src_data_wrap: WrappedPtr = WrappedPtr::wrap(src.data);
+            let other_data: WrappedPtr = WrappedPtr::wrap(other.data);
+            let tgt_data: WrappedPtr = WrappedPtr::wrap(self.data);
+            let src_q4_data: WrappedPtr = WrappedPtr::wrap(src.q4_data);
+
+            let nthreads: usize = rayon::current_num_threads();
+            (0..nthreads).into_par_iter().for_each(|thread_idx| {
+                let src_q4_data: *const u8 = src_q4_data.unwrap() as *const u8;
+                let src_data: *const u8 = src_data_wrap.unwrap() as *const u8;
+                let other_data: *const f32 = other_data.unwrap() as *const f32;
+                let tgt_data: *mut f32 = tgt_data.unwrap() as *mut f32;
+
                for row in 0..self_rows {
-                let quant0 = load_i16x8(src.q4_data.add(row * 32) as *const I16x8);
-                let quant1 = load_i16x8(src.q4_data.add(row * 32 + 16) as *const I16x8);
+                    let quant0 = load_i16x8(src_q4_data.add(row * 32) as *const I16x8);
+                    let quant1 = load_i16x8(src_q4_data.add(row * 32 + 16) as *const I16x8);
                    let quants: [F32x8; 2] =
                        [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];

                    for col in 0..self_cols {
+                        let row_col = row * self_cols + col;
+                        if row_col % nthreads != thread_idx {
+                            continue;
+                        }
+
                        #[inline]
                        fn load_f32(
                            other: *const f32,
@@ -1363,6 +1376,7 @@ impl Tensor {
                        *tgt_data.add(row * self_cols_capacity + col) = target;
                    }
                }
+            });
        }
    }