Rename to_gpu and to_cpu to to_gpu_inplace and to_cpu_inplace to make _inplace use consistent.

branch: master
author: Mikko Juola, 3 years ago
parent 1c5ec04217
commit 8aef5d8831

@@ -14,16 +14,34 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
let cl = OpenCL::new(false, 0).unwrap();
let mut mul_left = Tensor::random(1024, 1024, TensorDType::Float16);
- mul_left.to_gpu(&cl).unwrap();
+ mul_left.to_gpu_inplace(&cl).unwrap();
let mut mul_right = Tensor::random(1024, 1024, TensorDType::Float16);
- mul_right.to_gpu(&cl).unwrap();
+ mul_right.to_gpu_inplace(&cl).unwrap();
let mut mul_target = Tensor::zeros(1024, 1024, TensorDType::Float16);
- mul_target.to_gpu(&cl).unwrap();
+ mul_target.to_gpu_inplace(&cl).unwrap();
let mut mul_left_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
let mut mul_right_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
let mut mul_target_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
+ let mut mul_left1 = Tensor::random(4096, 11000, TensorDType::Float16);
+ let mut mul_right1 = Tensor::random(1, 11000, TensorDType::Float16);
+ let mut mul_target1 = Tensor::zeros(4096, 1, TensorDType::Float16);
+ mul_left1.to_gpu_inplace(&cl).unwrap();
+ mul_right1.to_gpu_inplace(&cl).unwrap();
+ mul_target1.to_gpu_inplace(&cl).unwrap();
+ c.bench_function(
+     "4096x11000 to 1x11000 matrix multiplication transposed on OpenCL",
+     |b| {
+         b.iter(|| {
+             mul_target1
+                 .matrix_mul_inplace_transposed(black_box(&mul_left1), black_box(&mul_right1));
+             mul_target1.finish();
+         })
+     },
+ );
c.bench_function(
"1024x1024 matrix multiplication transposed on OpenCL",
|b| {
@@ -43,24 +61,24 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
c.bench_function("1x1 matrix from CPU to OpenCL device and back", |b| {
b.iter(|| {
- let _ = orig1.to_gpu(&cl).unwrap();
- let _ = orig1.to_cpu();
+ let _ = orig1.to_gpu_inplace(&cl).unwrap();
+ let _ = orig1.to_cpu_inplace();
orig1.finish();
})
});
c.bench_function("1024x1024 matrix from CPU to OpenCL device and back", |b| {
b.iter(|| {
- let _ = orig16.to_gpu(&cl).unwrap();
- let _ = orig16.to_cpu();
+ let _ = orig16.to_gpu_inplace(&cl).unwrap();
+ let _ = orig16.to_cpu_inplace();
orig16.finish();
})
});
c.bench_function("4096x4096 matrix from CPU to OpenCL device and back", |b| {
b.iter(|| {
- let _ = orig32.to_gpu(&cl).unwrap();
- let _ = orig32.to_cpu();
+ let _ = orig32.to_gpu_inplace(&cl).unwrap();
+ let _ = orig32.to_cpu_inplace();
orig32.finish();
})
});

@@ -1,3 +1,22 @@
+ /*
+  * Tensors for RLLaMA
+  *
+  * This is not a general tensor library; it has just enough to run the transformers in the
+  * LLaMA model.
+  *
+  * The main structure you work with is Tensor, which is a 2D matrix. All Tensors here are 2D
+  * matrices with no flexibility.
+  *
+  * Tensors can be 16-bit or 32-bit, and they can live on OpenCL or on the CPU.
+  *
+  * Operations have this naming convention:
+  *
+  * If it's "to_XXX", then it returns a new tensor in the specified format.
+  * If it's "XXX_inplace", then it takes &mut self and modifies the tensor in place.
+  */
#[cfg(feature = "opencl")]
use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLEvent, OpenCLTensor};
use crate::unpickler;
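
A quick sketch of the convention the new header comment describes, using only calls that appear elsewhere in this diff (assumes the opencl feature; the sizes are arbitrary):

    let cl = OpenCL::new(false, 0).unwrap();
    let t = Tensor::random(16, 16, TensorDType::Float32);
    let mut t16 = t.to_f16();          // to_XXX: returns a new tensor, `t` is untouched
    t16.to_gpu_inplace(&cl).unwrap();  // XXX_inplace: mutates t16 through &mut self
    t16.to_cpu_inplace().unwrap();     // same tensor, moved back to the CPU
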
@@ -587,7 +606,7 @@ impl Tensor {
// TODO: do not create a CPU-side copy
let result = unsafe { Tensor::uninitialized(self.rows, self.cols, self.dtype) };
let mut result = result.to_f16();
- result.to_gpu(&cl).unwrap();
+ result.to_gpu_inplace(&cl).unwrap();
result.with_opencl_data_mut(|tgt_tensor| {
tgt_tensor.copy_inplace(self_tensor).unwrap();
other.with_opencl_data(|other_tensor| {
@@ -689,7 +708,7 @@ impl Tensor {
// TODO: don't generate a CPU-side copy, create the result directly on OpenCL side
let mut result = unsafe { Tensor::uninitialized(self.rows, self.cols, self.dtype) };
result = result.to_f16();
- result.to_gpu(&cl).unwrap();
+ result.to_gpu_inplace(&cl).unwrap();
result.with_opencl_data_mut(|tgt_tensor| {
tgt_tensor.copy_inplace(src_tensor).unwrap();
tgt_tensor.silu_inplace().unwrap();
@@ -733,7 +752,7 @@ impl Tensor {
// TODO: don't generate a CPU-side copy, create the result directly on OpenCL side
let mut result = unsafe { Tensor::uninitialized(self.cols, self.rows, self.dtype) };
result = result.to_f16();
- result.to_gpu(&cl).unwrap();
+ result.to_gpu_inplace(&cl).unwrap();
result.with_opencl_data_mut(|tgt_tensor| {
tgt_tensor.transpose_from(src_tensor).unwrap();
});
@@ -817,7 +836,7 @@ impl Tensor {
#[cfg(feature = "opencl")]
if self.is_on_gpu() {
let od = self.opencl_data.write().unwrap();
- result.to_gpu(&od.as_ref().unwrap().cl()).unwrap();
+ result.to_gpu_inplace(&od.as_ref().unwrap().cl()).unwrap();
}
result.matrix_mul_inplace_transposed(self, other);
@@ -1427,14 +1446,14 @@ impl Tensor {
///
/// The tensor is moved asynchronously.
#[cfg(feature = "opencl")]
- pub fn to_gpu(&mut self, cl: &OpenCL) -> Result<(), TensorError> {
+ pub fn to_gpu_inplace(&mut self, cl: &OpenCL) -> Result<(), TensorError> {
self.process_waiting_for_data_mut();
let mut od = self.opencl_data.write().unwrap();
if od.is_some() {
return Ok(());
}
if self.dtype != TensorDType::Float16 {
panic!("to_gpu: Only float16 tensors are supported on the GPU");
panic!("to_gpu_inplace: Only float16 tensors are supported on the GPU");
}
let cl_tensor = cl.data_u16_to_gpu(
self.data as *const u16,
@@ -1481,7 +1500,7 @@ impl Tensor {
/// Sends a tensor from the GPU to the CPU. This is a no-op if the tensor is already on the
/// CPU.
#[cfg(feature = "opencl")]
- pub fn to_cpu(&mut self) -> Result<(), TensorError> {
+ pub fn to_cpu_inplace(&mut self) -> Result<(), TensorError> {
self.process_waiting_for_data_mut();
let mut od = self.opencl_data.write().unwrap();
if od.is_none() {
@@ -1489,7 +1508,7 @@ impl Tensor {
}
let data = unsafe { std::alloc::alloc(self.layout) };
if data.is_null() {
panic!("to_cpu: Failed to allocate tensor");
panic!("to_cpu_inplace: Failed to allocate tensor");
}
let ev = od.as_mut().unwrap().data_u16_from_gpu(data as *mut u16)?;
self.data = data as *mut u16 as *mut u8;
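
Both renamed functions move data asynchronously (per the doc comment above, "The tensor is moved asynchronously"), so callers synchronize with finish() before reading or timing, as the benchmarks earlier in this diff do. A minimal round-trip sketch under those assumptions:

    let cl = OpenCL::new(false, 0).unwrap();
    let mut t = Tensor::random(1024, 1024, TensorDType::Float16);
    t.to_gpu_inplace(&cl).unwrap(); // transfer is enqueued, not yet complete
    t.to_cpu_inplace().unwrap();    // likewise asynchronous
    t.finish();                     // block until the queued transfers are done
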
@@ -2162,12 +2181,12 @@ mod tests {
let mut b2 = b.to_f16();
let mut c = Tensor::random(512, 768, TensorDType::Float32);
let mut c2 = Tensor::zeros(512, 768, TensorDType::Float32).to_f16();
- a2.to_gpu(&cl).unwrap();
- b2.to_gpu(&cl).unwrap();
- c2.to_gpu(&cl).unwrap();
+ a2.to_gpu_inplace(&cl).unwrap();
+ b2.to_gpu_inplace(&cl).unwrap();
+ c2.to_gpu_inplace(&cl).unwrap();
c.matrix_mul_inplace_transposed(&a, &b);
c2.matrix_mul_inplace_transposed(&a2, &b2);
- c2.to_cpu().unwrap();
+ c2.to_cpu_inplace().unwrap();
assert_eq!(c.rows(), c2.rows());
assert_eq!(c.cols(), c2.cols());
@@ -2189,12 +2208,12 @@ mod tests {
let mut b2 = b.to_f16();
let mut c = Tensor::random(1024, 1024, TensorDType::Float32);
let mut c2 = Tensor::zeros(1024, 1024, TensorDType::Float32).to_f16();
- a2.to_gpu(&cl).unwrap();
- b2.to_gpu(&cl).unwrap();
- c2.to_gpu(&cl).unwrap();
+ a2.to_gpu_inplace(&cl).unwrap();
+ b2.to_gpu_inplace(&cl).unwrap();
+ c2.to_gpu_inplace(&cl).unwrap();
c.matrix_mul_inplace_transposed(&a, &b);
c2.matrix_mul_inplace_transposed(&a2, &b2);
- c2.to_cpu().unwrap();
+ c2.to_cpu_inplace().unwrap();
assert_eq!(c.rows(), c2.rows());
assert_eq!(c.cols(), c2.cols());
@@ -2218,11 +2237,11 @@ mod tests {
let mat1 = Tensor::random(a, b, TensorDType::Float16);
let mat2 = mat1.clone();
let mut mat2 = mat2.to_f16();
- mat2.to_gpu(&cl).unwrap();
+ mat2.to_gpu_inplace(&cl).unwrap();
let mat1_result = mat1.silu();
let mut mat2_result = mat2.silu();
- mat2_result.to_cpu().unwrap();
+ mat2_result.to_cpu_inplace().unwrap();
assert_eq!(mat1_result.rows(), mat2_result.rows());
assert_eq!(mat1_result.cols(), mat2_result.cols());
@@ -2253,12 +2272,12 @@ mod tests {
let mut mat1_gpu = mat1.to_f16();
let mut mat2_gpu = mat2.to_f16();
- mat1_gpu.to_gpu(&cl).unwrap();
- mat2_gpu.to_gpu(&cl).unwrap();
+ mat1_gpu.to_gpu_inplace(&cl).unwrap();
+ mat2_gpu.to_gpu_inplace(&cl).unwrap();
let result1 = mat1.hadamard_product(&mat2);
let mut result2 = mat1_gpu.hadamard_product(&mat2_gpu);
- result2.to_cpu().unwrap();
+ result2.to_cpu_inplace().unwrap();
assert_eq!(result1.rows(), result2.rows());
assert_eq!(result1.cols(), result2.cols());
@@ -2285,11 +2304,11 @@ mod tests {
let b = rng.gen_range(1..=100);
let mat1 = Tensor::random(a, b, TensorDType::Float16);
let mut mat1_gpu = mat1.to_f16();
- mat1_gpu.to_gpu(&cl).unwrap();
+ mat1_gpu.to_gpu_inplace(&cl).unwrap();
let mat1_transposed = mat1.transpose();
let mut mat1_gpu_transposed = mat1_gpu.transpose();
- mat1_gpu_transposed.to_cpu().unwrap();
+ mat1_gpu_transposed.to_cpu_inplace().unwrap();
assert_eq!(mat1_transposed.rows(), mat1_gpu_transposed.rows());
assert_eq!(mat1_transposed.cols(), mat1_gpu_transposed.cols());
@@ -2323,9 +2342,52 @@ mod tests {
let mut mat1_gpu = mat1.clone();
let mut mat2_gpu = mat2.clone();
let mut mat3_gpu = mat3.clone();
- mat1_gpu.to_gpu(&cl).unwrap();
- mat2_gpu.to_gpu(&cl).unwrap();
- mat3_gpu.to_gpu(&cl).unwrap();
+ mat1_gpu.to_gpu_inplace(&cl).unwrap();
+ mat2_gpu.to_gpu_inplace(&cl).unwrap();
+ mat3_gpu.to_gpu_inplace(&cl).unwrap();
let mat1 = mat1.to_f32();
let mat2 = mat2.to_f32();
let mut mat3 = mat3.to_f32();
+ mat3.matrix_mul_inplace_transposed(&mat1, &mat2);
+ mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu);
+ mat3_gpu.to_cpu_inplace().unwrap();
+ assert_eq!(mat3.rows(), mat3_gpu.rows());
+ assert_eq!(mat3.cols(), mat3_gpu.cols());
+ for row in 0..mat3.rows {
+     for col in 0..mat3.cols {
+         assert_relative_eq!(
+             mat3.get_f32(row, col),
+             mat3_gpu.get_f32(row, col),
+             epsilon = 1e-2,
+         );
+     }
+ }
+ }
+ }
+ #[cfg(feature = "opencl")]
+ #[test]
+ fn gpu_matrix_mul_vector_transposed_is_close_to_cpu_matrix_mul_vector_transposed() {
+     let cl = OpenCL::new(false, 0).unwrap();
+     let mut rng = rand::thread_rng();
+     for _trial in 0..300 {
+         let a = rng.gen_range(1..=300);
+         let b = rng.gen_range(1..=300);
+         let mat1 = Tensor::random(a, b, TensorDType::Float16);
+         let mat2 = Tensor::random(1, b, TensorDType::Float16);
+         let mat3 = Tensor::random(a, 1, TensorDType::Float16);
+         let mut mat1_gpu = mat1.clone();
+         let mut mat2_gpu = mat2.clone();
+         let mut mat3_gpu = mat3.clone();
+         mat1_gpu.to_gpu_inplace(&cl).unwrap();
+         mat2_gpu.to_gpu_inplace(&cl).unwrap();
+         mat3_gpu.to_gpu_inplace(&cl).unwrap();
+         let mat1 = mat1.to_f32();
+         let mat2 = mat2.to_f32();
@@ -2333,7 +2395,7 @@ mod tests {
mat3.matrix_mul_inplace_transposed(&mat1, &mat2);
mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu);
- mat3_gpu.to_cpu().unwrap();
+ mat3_gpu.to_cpu_inplace().unwrap();
assert_eq!(mat3.rows(), mat3_gpu.rows());
assert_eq!(mat3.cols(), mat3_gpu.cols());

@@ -314,12 +314,19 @@ impl OpenCLTensor {
);
}
- // Clear out the target memory
+ // Clear out the target memory.
unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
let prg = self.cl.programs.write().unwrap();
- let prg = if self.cl.is_cpu_device {
+ // 0 = CPU optimized
+ // 1 = GPU optimized
+ // 2 = GPU optimized vector multiply (other.rows == 1)
+ const CPU: u8 = 0;
+ const GPU: u8 = 1;
+ let strategy: u8 = if self.cl.is_cpu_device { CPU } else { GPU };
+ let prg = if strategy == CPU {
&prg.matrix_mul_transposed_f16_cpu_optimized
} else {
&prg.matrix_mul_transposed_f16
@@ -347,14 +354,14 @@
};
unsafe {
- if self.cl.is_cpu_device {
+ if strategy == CPU {
let b = prg
.cmd()
.queue(&self.queue)
.global_work_size([self.cols as usize, self.rows as usize])
.enew(&mut event);
b.enq()?;
- } else {
+ } else if strategy == GPU {
let b = prg
.cmd()
.queue(&self.queue)
@@ -362,6 +369,13 @@
.local_work_size([16, 16])
.enew(&mut event);
b.enq()?;
+ } else {
+     let b = prg
+         .cmd()
+         .queue(&self.queue)
+         .global_work_size([self.cols as usize, self.rows as usize])
+         .enew(&mut event);
+     b.enq()?;
}
}
self.last_event = Some(event.clone());
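
The new comments name a third strategy (2, a vector multiply for the other.rows == 1 case), and the trailing else branch above enqueues it, but this excerpt never assigns 2 to `strategy`; presumably that selection logic lands elsewhere. The shape of the pattern, reduced to a standalone sketch where every name is hypothetical rather than taken from the real OpenCLTensor code:

    const CPU: u8 = 0;
    const GPU: u8 = 1;
    const GPU_VEC: u8 = 2;

    // Hypothetical selector: CPU devices get the CPU-optimized kernel, row
    // vectors get the specialized multiply, everything else the tiled kernel.
    fn pick_strategy(is_cpu_device: bool, other_rows: usize) -> u8 {
        if is_cpu_device {
            CPU
        } else if other_rows == 1 {
            GPU_VEC
        } else {
            GPU
        }
    }

Each strategy then maps to one enq() call with its own global/local work sizes, as the three branches above show.
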

@@ -407,9 +407,9 @@ impl FeedForward {
w2 = w2.to_f16();
w3 = w3.to_f16();
let ds = data_settings.clone();
- w1.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- w2.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- w3.to_gpu(&ds.cl.unwrap()).unwrap();
+ w1.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ w2.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ w3.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
}
}
#[cfg(not(feature = "opencl"))]
@@ -435,7 +435,8 @@ impl FeedForward {
x_was_on_cpu = x.is_on_cpu();
if self.data_settings.use_opencl_for_feedforward {
*x = x.to_f16();
- x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
+ x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
+     .unwrap();
}
}
let (w1_out, w3_out) = rayon::join(
@@ -457,7 +458,7 @@
{
let mut result = self.w2.matrix_mul_transposed(&w1w3_out);
if x_was_on_cpu {
- result.to_cpu().unwrap();
+ result.to_cpu_inplace().unwrap();
result
} else {
result
@@ -510,10 +511,10 @@ impl Attention {
wv = wv.to_f16();
wo = wo.to_f16();
let ds = data_settings.clone();
- wq.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- wk.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- wv.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
- wo.to_gpu(&ds.cl.unwrap()).unwrap();
+ wq.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ wk.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ wv.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap();
+ wo.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
}
}
#[cfg(not(feature = "opencl"))]
@@ -550,7 +551,8 @@ impl Attention {
x_was_on_cpu = x.is_on_cpu();
if self.data_settings.use_opencl_for_attention {
*x = x.to_f16();
- x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
+ x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
+     .unwrap();
}
}
@@ -560,9 +562,9 @@
let mut xq_out = x.matrix_mul_transposed(&self.wq);
let mut xk_out = x.matrix_mul_transposed(&self.wk);
let mut xv_out = x.matrix_mul_transposed(&self.wv);
- xq_out.to_cpu().unwrap();
- xk_out.to_cpu().unwrap();
- xv_out.to_cpu().unwrap();
+ xq_out.to_cpu_inplace().unwrap();
+ xk_out.to_cpu_inplace().unwrap();
+ xv_out.to_cpu_inplace().unwrap();
(xq_out.to_f32(), xk_out.to_f32(), xv_out.to_f32())
};
@@ -673,10 +675,10 @@ impl Attention {
.to_f16();
if self.wo.is_on_gpu() {
xq_row
- .to_gpu(&self.data_settings.cl.as_ref().unwrap())
+ .to_gpu_inplace(&self.data_settings.cl.as_ref().unwrap())
.unwrap();
let mut result = xq_row.matrix_mul_transposed(&self.wo);
- result.to_cpu().unwrap();
+ result.to_cpu_inplace().unwrap();
result.to_f32()
} else {
xq_row.matrix_mul_transposed(&self.wo)
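
FeedForward and Attention share the same movement discipline around their multiplies: record where the caller's tensor lived, move it to the GPU when the OpenCL path is enabled, and move the result back only if the input started on the CPU. Condensed into a sketch; the function shape and gating condition are simplified from the real code, which checks settings fields like use_opencl_for_feedforward:

    fn forward_with_opencl(x: &mut Tensor, w: &Tensor, cl: &OpenCL) -> Tensor {
        let x_was_on_cpu = x.is_on_cpu();
        if x_was_on_cpu {
            *x = x.to_f16();                  // the GPU path supports only float16
            x.to_gpu_inplace(cl).unwrap();
        }
        let mut result = x.matrix_mul_transposed(w);
        if x_was_on_cpu {
            result.to_cpu_inplace().unwrap(); // hand the caller a CPU tensor
        }
        result
    }
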
