Rename to_gpu and to_cpu to to_gpu_inplace and to_cpu_inplace to make _inplace use consistent.

branch: master
author: Mikko Juola, 3 years ago
parent 1c5ec04217
commit 8aef5d8831

@@ -14,16 +14,34 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
let cl = OpenCL::new(false, 0).unwrap();
let mut mul_left = Tensor::random(1024, 1024, TensorDType::Float16);
- mul_left.to_gpu(&cl).unwrap();
+ mul_left.to_gpu_inplace(&cl).unwrap();
let mut mul_right = Tensor::random(1024, 1024, TensorDType::Float16);
- mul_right.to_gpu(&cl).unwrap();
+ mul_right.to_gpu_inplace(&cl).unwrap();
let mut mul_target = Tensor::zeros(1024, 1024, TensorDType::Float16);
- mul_target.to_gpu(&cl).unwrap();
+ mul_target.to_gpu_inplace(&cl).unwrap();
let mut mul_left_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
let mut mul_right_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
let mut mul_target_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
+ let mut mul_left1 = Tensor::random(4096, 11000, TensorDType::Float16);
+ let mut mul_right1 = Tensor::random(1, 11000, TensorDType::Float16);
+ let mut mul_target1 = Tensor::zeros(4096, 1, TensorDType::Float16);
+ mul_left1.to_gpu_inplace(&cl).unwrap();
+ mul_right1.to_gpu_inplace(&cl).unwrap();
+ mul_target1.to_gpu_inplace(&cl).unwrap();
+ c.bench_function(
+     "4096x11000 to 1x11000 matrix multiplication transposed on OpenCL",
+     |b| {
+         b.iter(|| {
+             mul_target1
+                 .matrix_mul_inplace_transposed(black_box(&mul_left1), black_box(&mul_right1));
+             mul_target1.finish();
+         })
+     },
+ );
c.bench_function(
"1024x1024 matrix multiplication transposed on OpenCL",
|b| {
@@ -43,24 +61,24 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
c.bench_function("1x1 matrix from CPU to OpenCL device and back", |b| {
b.iter(|| {
- let _ = orig1.to_gpu(&cl).unwrap();
- let _ = orig1.to_cpu();
+ let _ = orig1.to_gpu_inplace(&cl).unwrap();
+ let _ = orig1.to_cpu_inplace();
orig1.finish();
})
});
c.bench_function("1024x1024 matrix from CPU to OpenCL device and back", |b| {
b.iter(|| {
- let _ = orig16.to_gpu(&cl).unwrap();
- let _ = orig16.to_cpu();
+ let _ = orig16.to_gpu_inplace(&cl).unwrap();
+ let _ = orig16.to_cpu_inplace();
orig16.finish();
})
});
c.bench_function("4096x4096 matrix from CPU to OpenCL device and back", |b| {
b.iter(|| {
- let _ = orig32.to_gpu(&cl).unwrap();
- let _ = orig32.to_cpu();
+ let _ = orig32.to_gpu_inplace(&cl).unwrap();
+ let _ = orig32.to_cpu_inplace();
orig32.finish();
})
});

@@ -1,3 +1,22 @@
+ /*
+  * Tensors for RLLaMA
+  *
+  * This is not a general tensor library; it has just enough to run the transformers in the
+  * LLaMA model.
+  *
+  * The main structure you work with is Tensor, which is a 2D matrix. All Tensors here are 2D
+  * matrices with no flexibility.
+  *
+  * Tensors can be 16-bit or 32-bit, and they can live on OpenCL or on the CPU.
+  *
+  * Operations have this naming convention:
+  *
+  * If it's "to_XXX", then it returns a new tensor in the specified format.
+  * If it's "XXX_inplace", then it takes &mut self and modifies the tensor in place.
+  */
#[cfg(feature = "opencl")]
use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLEvent, OpenCLTensor};
use crate::unpickler;
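
A quick sketch of the convention the new header comment describes, using only calls that appear elsewhere in this diff (assumes the opencl feature; the sizes are arbitrary):

    let cl = OpenCL::new(false, 0).unwrap();
    let t = Tensor::random(16, 16, TensorDType::Float32);
    let mut t16 = t.to_f16();          // to_XXX: returns a new tensor, `t` is untouched
    t16.to_gpu_inplace(&cl).unwrap();  // XXX_inplace: mutates t16 through &mut self
    t16.to_cpu_inplace().unwrap();     // same tensor, moved back to the CPU
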
@@ -587,7 +606,7 @@ impl Tensor {
// TODO: do not create a CPU-side copy
let result = unsafe { Tensor::uninitialized(self.rows, self.cols, self.dtype) };
let mut result = result.to_f16();
- result.to_gpu(&cl).unwrap();
+ result.to_gpu_inplace(&cl).unwrap();
result.with_opencl_data_mut(|tgt_tensor| {
tgt_tensor.copy_inplace(self_tensor).unwrap();
other.with_opencl_data(|other_tensor| {
@@ -689,7 +708,7 @@ impl Tensor {
// TODO: don't generate a CPU-side copy, create the result directly on OpenCL side
let mut result = unsafe { Tensor::uninitialized(self.rows, self.cols, self.dtype) };
result = result.to_f16();
- result.to_gpu(&cl).unwrap();
+ result.to_gpu_inplace(&cl).unwrap();
result.with_opencl_data_mut(|tgt_tensor| {
tgt_tensor.copy_inplace(src_tensor).unwrap();
tgt_tensor.silu_inplace().unwrap();
@@ -733,7 +752,7 @@ impl Tensor {
// TODO: don't generate a CPU-side copy, create the result directly on OpenCL side
let mut result = unsafe { Tensor::uninitialized(self.cols, self.rows, self.dtype) };
result = result.to_f16();
- result.to_gpu(&cl).unwrap();
+ result.to_gpu_inplace(&cl).unwrap();
result.with_opencl_data_mut(|tgt_tensor| {
tgt_tensor.transpose_from(src_tensor).unwrap();
});
@@ -817,7 +836,7 @@ impl Tensor {
#[cfg(feature = "opencl")]
if self.is_on_gpu() {
let od = self.opencl_data.write().unwrap();
- result.to_gpu(&od.as_ref().unwrap().cl()).unwrap();
+ result.to_gpu_inplace(&od.as_ref().unwrap().cl()).unwrap();
}
result.matrix_mul_inplace_transposed(self, other);
@@ -1427,14 +1446,14 @@ impl Tensor {
///
/// The tensor is moved asynchronously.
#[cfg(feature = "opencl")]
- pub fn to_gpu(&mut self, cl: &OpenCL) -> Result<(), TensorError> {
+ pub fn to_gpu_inplace(&mut self, cl: &OpenCL) -> Result<(), TensorError> {
self.process_waiting_for_data_mut();
let mut od = self.opencl_data.write().unwrap();
if od.is_some() {
return Ok(());
}
if self.dtype != TensorDType::Float16 {
panic!("to_gpu: Only float16 tensors are supported on the GPU");
panic!("to_gpu_inplace: Only float16 tensors are supported on the GPU");
}
let cl_tensor = cl.data_u16_to_gpu(
self.data as *const u16,
@@ -1481,7 +1500,7 @@ impl Tensor {
/// Sends a tensor from the GPU to the CPU. This is a no-op if the tensor is already on the
/// CPU.
#[cfg(feature = "opencl")]
- pub fn to_cpu(&mut self) -> Result<(), TensorError> {
+ pub fn to_cpu_inplace(&mut self) -> Result<(), TensorError> {
self.process_waiting_for_data_mut();
let mut od = self.opencl_data.write().unwrap();
if od.is_none() {
@@ -1489,7 +1508,7 @@ impl Tensor {
}
let data = unsafe { std::alloc::alloc(self.layout) };
if data.is_null() {
panic!("to_cpu: Failed to allocate tensor");
panic!("to_cpu_inplace: Failed to allocate tensor");
}
let ev = od.as_mut().unwrap().data_u16_from_gpu(data as *mut u16)?;
self.data = data as *mut u16 as *mut u8;
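
Both renamed functions move data asynchronously (per the doc comment above, "The tensor is moved asynchronously"), so callers synchronize with finish() before reading or timing, as the benchmarks earlier in this diff do. A minimal round-trip sketch under those assumptions:

    let cl = OpenCL::new(false, 0).unwrap();
    let mut t = Tensor::random(1024, 1024, TensorDType::Float16);
    t.to_gpu_inplace(&cl).unwrap(); // transfer is enqueued, not yet complete
    t.to_cpu_inplace().unwrap();    // likewise asynchronous
    t.finish();                     // block until the queued transfers are done
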
@@ -2162,12 +2181,12 @@ mod tests {
let mut b2 = b.to_f16();
let mut c = Tensor::random(512, 768, TensorDType::Float32);
let mut c2 = Tensor::zeros(512, 768, TensorDType::Float32).to_f16();
- a2.to_gpu(&cl).unwrap();
- b2.to_gpu(&cl).unwrap();
- c2.to_gpu(&cl).unwrap();
+ a2.to_gpu_inplace(&cl).unwrap();
+ b2.to_gpu_inplace(&cl).unwrap();
+ c2.to_gpu_inplace(&cl).unwrap();
c.matrix_mul_inplace_transposed(&a, &b);
c2.matrix_mul_inplace_transposed(&a2, &b2);
- c2.to_cpu().unwrap();
+ c2.to_cpu_inplace().unwrap();
assert_eq!(c.rows(), c2.rows());
assert_eq!(c.cols(), c2.cols());
@@ -2189,12 +2208,12 @@ mod tests {
let mut b2 = b.to_f16();
let mut c = Tensor::random(1024, 1024, TensorDType::Float32);
let mut c2 = Tensor::zeros(1024, 1024, TensorDType::Float32).to_f16();
- a2.to_gpu(&cl).unwrap();
- b2.to_gpu(&cl).unwrap();
- c2.to_gpu(&cl).unwrap();
+ a2.to_gpu_inplace(&cl).unwrap();
+ b2.to_gpu_inplace(&cl).unwrap();
+ c2.to_gpu_inplace(&cl).unwrap();
c.matrix_mul_inplace_transposed(&a, &b);
c2.matrix_mul_inplace_transposed(&a2, &b2);
- c2.to_cpu().unwrap();
+ c2.to_cpu_inplace().unwrap();
assert_eq!(c.rows(), c2.rows());
assert_eq!(c.cols(), c2.cols());
@@ -2218,11 +2237,11 @@ mod tests {
let mat1 = Tensor::random(a, b, TensorDType::Float16);
let mat2 = mat1.clone();
let mut mat2 = mat2.to_f16();
- mat2.to_gpu(&cl).unwrap();
+ mat2.to_gpu_inplace(&cl).unwrap();
let mat1_result = mat1.silu();
let mut mat2_result = mat2.silu();
- mat2_result.to_cpu().unwrap();
+ mat2_result.to_cpu_inplace().unwrap();
assert_eq!(mat1_result.rows(), mat2_result.rows());
assert_eq!(mat1_result.cols(), mat2_result.cols());
@@ -2253,12 +2272,12 @@ mod tests {
let mut mat1_gpu = mat1.to_f16();
let mut mat2_gpu = mat2.to_f16();
- mat1_gpu.to_gpu(&cl).unwrap();
- mat2_gpu.to_gpu(&cl).unwrap();
+ mat1_gpu.to_gpu_inplace(&cl).unwrap();
+ mat2_gpu.to_gpu_inplace(&cl).unwrap();
let result1 = mat1.hadamard_product(&mat2);
let mut result2 = mat1_gpu.hadamard_product(&mat2_gpu);
- result2.to_cpu().unwrap();
+ result2.to_cpu_inplace().unwrap();
assert_eq!(result1.rows(), result2.rows());
assert_eq!(result1.cols(), result2.cols());
@@ -2285,11 +2304,11 @@ mod tests {
let b = rng.gen_range(1..=100);
let mat1 = Tensor::random(a, b, TensorDType::Float16);
let mut mat1_gpu = mat1.to_f16();
- mat1_gpu.to_gpu(&cl).unwrap();
+ mat1_gpu.to_gpu_inplace(&cl).unwrap();
let mat1_transposed = mat1.transpose();
let mut mat1_gpu_transposed = mat1_gpu.transpose();
- mat1_gpu_transposed.to_cpu().unwrap();
+ mat1_gpu_transposed.to_cpu_inplace().unwrap();
assert_eq!(mat1_transposed.rows(), mat1_gpu_transposed.rows());
assert_eq!(mat1_transposed.cols(), mat1_gpu_transposed.cols());
@@ -2323,9 +2342,52 @@ mod tests {
let mut mat1_gpu = mat1.clone();
let mut mat2_gpu = mat2.clone();
let mut mat3_gpu = mat3.clone();
- mat1_gpu.to_gpu(&cl).unwrap();
- mat2_gpu.to_gpu(&cl).unwrap();
- mat3_gpu.to_gpu(&cl).unwrap();
+ mat1_gpu.to_gpu_inplace(&cl).unwrap();
+ mat2_gpu.to_gpu_inplace(&cl).unwrap();
+ mat3_gpu.to_gpu_inplace(&cl).unwrap();
let mat1 = mat1.to_f32();
let mat2 = mat2.to_f32();
let mut mat3 = mat3.to_f32();
+ mat3.matrix_mul_inplace_transposed(&mat1, &mat2);
+ mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu);
+ mat3_gpu.to_cpu_inplace().unwrap();
+ assert_eq!(mat3.rows(), mat3_gpu.rows());
+ assert_eq!(mat3.cols(), mat3_gpu.cols());
+ for row in 0..mat3.rows {
+     for col in 0..mat3.cols {
+         assert_relative_eq!(
+             mat3.get_f32(row, col),
+             mat3_gpu.get_f32(row, col),
+             epsilon = 1e-2,
+         );
+     }
+ }
+ }
+ }
+ #[cfg(feature = "opencl")]
+ #[test]
+ fn gpu_matrix_mul_vector_transposed_is_close_to_cpu_matrix_mul_vector_transposed() {
+     let cl = OpenCL::new(false, 0).unwrap();
+     let mut rng = rand::thread_rng();
+     for _trial in 0..300 {
+         let a = rng.gen_range(1..=300);
+         let b = rng.gen_range(1..=300);
+         let mat1 = Tensor::random(a, b, TensorDType::Float16);
+         let mat2 = Tensor::random(1, b, TensorDType::Float16);
+         let mat3 = Tensor::random(a, 1, TensorDType::Float16);
+         let mut mat1_gpu = mat1.clone();
+         let mut mat2_gpu = mat2.clone();
+         let mut mat3_gpu = mat3.clone();
+         mat1_gpu.to_gpu_inplace(&cl).unwrap();
+         mat2_gpu.to_gpu_inplace(&cl).unwrap();
+         mat3_gpu.to_gpu_inplace(&cl).unwrap();
+         let mat1 = mat1.to_f32();
+         let mat2 = mat2.to_f32();
@@ -2333,7 +2395,7 @@ mod tests {
mat3.matrix_mul_inplace_transposed(&mat1, &mat2);
mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu);
- mat3_gpu.to_cpu().unwrap();
+ mat3_gpu.to_cpu_inplace().unwrap();
assert_eq!(mat3.rows(), mat3_gpu.rows());
assert_eq!(mat3.cols(), mat3_gpu.cols());

@@ -314,12 +314,19 @@ impl OpenCLTensor {
);
}
- // Clear out the target memory
+ // Clear out the target memory.
unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
let prg = self.cl.programs.write().unwrap();
- let prg = if self.cl.is_cpu_device {
+ // 0 = CPU optimized
+ // 1 = GPU optimized
+ // 2 = GPU optimized vector multiply (other.rows == 1)
+ const CPU: u8 = 0;
+ const GPU: u8 = 1;
+ let strategy: u8 = if self.cl.is_cpu_device { CPU } else { GPU };
+ let prg = if strategy == CPU {
&prg.matrix_mul_transposed_f16_cpu_optimized
} else {
&prg.matrix_mul_transposed_f16
@@ -347,14 +354,14 @@
};
unsafe {
- if self.cl.is_cpu_device {
+ if strategy == CPU {
let b = prg
.cmd()
.queue(&self.queue)
.global_work_size([self.cols as usize, self.rows as usize])
.enew(&mut event);
b.enq()?;
- } else {
+ } else if strategy == GPU {
let b = prg
.cmd()
.queue(&self.queue)
@@ -362,6 +369,13 @@
.local_work_size([16, 16])
.enew(&mut event);
b.enq()?;
+ } else {
+     let b = prg
+         .cmd()
+         .queue(&self.queue)
+         .global_work_size([self.cols as usize, self.rows as usize])
+         .enew(&mut event);
+     b.enq()?;
}
}
self.last_event = Some(event.clone());
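
The new comments name a third strategy (2, a vector multiply for the other.rows == 1 case), and the trailing else branch above enqueues it, but this excerpt never assigns 2 to `strategy`; presumably that selection logic lands elsewhere. The shape of the pattern, reduced to a standalone sketch where every name is hypothetical rather than taken from the real OpenCLTensor code:

    const CPU: u8 = 0;
    const GPU: u8 = 1;
    const GPU_VEC: u8 = 2;

    // Hypothetical selector: CPU devices get the CPU-optimized kernel, row
    // vectors get the specialized multiply, everything else the tiled kernel.
    fn pick_strategy(is_cpu_device: bool, other_rows: usize) -> u8 {
        if is_cpu_device {
            CPU
        } else if other_rows == 1 {
            GPU_VEC
        } else {
            GPU
        }
    }

Each strategy then maps to one enq() call with its own global/local work sizes, as the three branches above show.
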

@@ -407,9 +407,9 @@ impl FeedForward {
w2 = w2.to_f16();
w3 = w3.to_f16();
let ds = data_settings.clone();
- w1.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- w2.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- w3.to_gpu(&ds.cl.unwrap()).unwrap();
+ w1.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ w2.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ w3.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
}
}
#[cfg(not(feature = "opencl"))]
@@ -435,7 +435,8 @@ impl FeedForward {
x_was_on_cpu = x.is_on_cpu();
if self.data_settings.use_opencl_for_feedforward {
*x = x.to_f16();
- x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
+ x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
+     .unwrap();
}
}
let (w1_out, w3_out) = rayon::join(
@@ -457,7 +458,7 @@
{
let mut result = self.w2.matrix_mul_transposed(&w1w3_out);
if x_was_on_cpu {
- result.to_cpu().unwrap();
+ result.to_cpu_inplace().unwrap();
result
} else {
result
@@ -510,10 +511,10 @@ impl Attention {
wv = wv.to_f16();
wo = wo.to_f16();
let ds = data_settings.clone();
- wq.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- wk.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- wv.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- wo.to_gpu(&ds.cl.unwrap()).unwrap();
+ wq.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ wk.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ wv.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ wo.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
}
}
#[cfg(not(feature = "opencl"))]
@@ -550,7 +551,8 @@ impl Attention {
x_was_on_cpu = x.is_on_cpu();
if self.data_settings.use_opencl_for_attention {
*x = x.to_f16();
- x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
+ x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
+     .unwrap();
}
}
@@ -560,9 +562,9 @@
let mut xq_out = x.matrix_mul_transposed(&self.wq);
let mut xk_out = x.matrix_mul_transposed(&self.wk);
let mut xv_out = x.matrix_mul_transposed(&self.wv);
- xq_out.to_cpu().unwrap();
- xk_out.to_cpu().unwrap();
- xv_out.to_cpu().unwrap();
+ xq_out.to_cpu_inplace().unwrap();
+ xk_out.to_cpu_inplace().unwrap();
+ xv_out.to_cpu_inplace().unwrap();
(xq_out.to_f32(), xk_out.to_f32(), xv_out.to_f32())
};
@@ -673,10 +675,10 @@ impl Attention {
.to_f16();
if self.wo.is_on_gpu() {
xq_row
- .to_gpu(&self.data_settings.cl.as_ref().unwrap())
+ .to_gpu_inplace(&self.data_settings.cl.as_ref().unwrap())
.unwrap();
let mut result = xq_row.matrix_mul_transposed(&self.wo);
- result.to_cpu().unwrap();
+ result.to_cpu_inplace().unwrap();
result.to_f32()
} else {
xq_row.matrix_mul_transposed(&self.wo)
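
FeedForward and Attention share the same movement discipline around their multiplies: record where the caller's tensor lived, move it to the GPU when the OpenCL path is enabled, and move the result back only if the input started on the CPU. Condensed into a sketch; the function shape and gating condition are simplified from the real code, which checks settings fields like use_opencl_for_feedforward:

    fn forward_with_opencl(x: &mut Tensor, w: &Tensor, cl: &OpenCL) -> Tensor {
        let x_was_on_cpu = x.is_on_cpu();
        if x_was_on_cpu {
            *x = x.to_f16();                  // the GPU path supports only float16
            x.to_gpu_inplace(cl).unwrap();
        }
        let mut result = x.matrix_mul_transposed(w);
        if x_was_on_cpu {
            result.to_cpu_inplace().unwrap(); // hand the caller a CPU tensor
        }
        result
    }
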
