@@ -1187,10 +1187,6 @@ impl Tensor {
         let self_cols: usize = self.cols as usize;
         let self_cols_capacity: usize = self.capacity_cols as usize;
-        let src_data: *const u8 = src.data;
-        let other_data: *const f32 = other.data as *const f32;
-        let tgt_data: *mut f32 = self.data as *mut f32;
         // src_cols_its == also the shared dimension between src and other.
         let src_cols_its = if src_cols % 32 == 0 {
             src_cols / 32
@@ -1199,13 +1195,30 @@ impl Tensor {
         };
         debug_assert!(!src.q4_data.is_null());
+        let src_data_wrap: WrappedPtr = WrappedPtr::wrap(src.data);
+        let other_data: WrappedPtr = WrappedPtr::wrap(other.data);
+        let tgt_data: WrappedPtr = WrappedPtr::wrap(self.data);
+        let src_q4_data: WrappedPtr = WrappedPtr::wrap(src.q4_data);
+        let nthreads: usize = rayon::current_num_threads();
+        (0..nthreads).into_par_iter().for_each(|thread_idx| {
+            let src_q4_data: *const u8 = src_q4_data.unwrap() as *const u8;
+            let src_data: *const u8 = src_data_wrap.unwrap() as *const u8;
+            let other_data: *const f32 = other_data.unwrap() as *const f32;
+            let tgt_data: *mut f32 = tgt_data.unwrap() as *mut f32;
         for row in 0..self_rows {
-            let quant0 = load_i16x8(src.q4_data.add(row * 32) as *const I16x8);
-            let quant1 = load_i16x8(src.q4_data.add(row * 32 + 16) as *const I16x8);
+            let quant0 = load_i16x8(src_q4_data.add(row * 32) as *const I16x8);
+            let quant1 = load_i16x8(src_q4_data.add(row * 32 + 16) as *const I16x8);
             let quants: [F32x8; 2] =
                 [i16x8_as_f16_to_f32x8(quant0), i16x8_as_f16_to_f32x8(quant1)];
 
             for col in 0..self_cols {
+                let row_col = row * self_cols + col;
+                if row_col % nthreads != thread_idx {
+                    continue;
+                }
 
                 #[inline]
                 fn load_f32(
                     other: *const f32,
@@ -1363,6 +1376,7 @@ impl Tensor {
                 *tgt_data.add(row * self_cols_capacity + col) = target;
             }
         }
+        });
     }
 }
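A minimal, self-contained sketch of the pattern this diff applies, assuming `WrappedPtr` is essentially a `Send`/`Sync` newtype around a raw pointer (the `SendPtr` and `scale_cells` names below are illustrative, not from the repository): raw pointers are not `Send`, so they are wrapped before the rayon closure, unwrapped inside each worker, and the flattened `(row, col)` indices are striped across threads via `row_col % nthreads != thread_idx`, so no two threads ever write the same cell.

```rust
use rayon::prelude::*;

/// Hypothetical stand-in for `WrappedPtr`: a newtype that carries a raw
/// pointer across threads. Safety contract: callers guarantee no two threads
/// write the same element.
#[derive(Copy, Clone)]
struct SendPtr(*mut f32);
unsafe impl Send for SendPtr {}
unsafe impl Sync for SendPtr {}

impl SendPtr {
    fn wrap(p: *mut f32) -> Self {
        SendPtr(p)
    }
    fn unwrap(self) -> *mut f32 {
        self.0
    }
}

/// Scales every cell of a rows x cols matrix, distributing the flattened
/// (row, col) indices across rayon's worker threads the same way the diff
/// does with `row_col % nthreads != thread_idx`.
fn scale_cells(data: &mut [f32], rows: usize, cols: usize, factor: f32) {
    assert_eq!(data.len(), rows * cols);
    let ptr = SendPtr::wrap(data.as_mut_ptr());
    let nthreads = rayon::current_num_threads();
    (0..nthreads).into_par_iter().for_each(|thread_idx| {
        let data = ptr.unwrap();
        for row in 0..rows {
            for col in 0..cols {
                let row_col = row * cols + col;
                // Skip cells owned by other threads; writes therefore never overlap.
                if row_col % nthreads != thread_idx {
                    continue;
                }
                unsafe {
                    *data.add(row_col) *= factor;
                }
            }
        }
    });
}

fn main() {
    let mut m = vec![1.0f32; 4 * 8];
    scale_cells(&mut m, 4, 8, 2.0);
    assert!(m.iter().all(|&x| (x - 2.0).abs() < f32::EPSILON));
}
```

Striping by flattened index keeps the change minimal relative to the serial loop; an alternative such as handing each thread a contiguous block of rows (e.g. rayon's `par_chunks_mut`) would likely have better cache behavior but would require restructuring the row loop.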