From 8aef5d8831bf57e3ef11b964a9be108a3573de7b Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Wed, 15 Mar 2023 11:45:15 -0700 Subject: [PATCH] Rename to_gpu and to_cpu to to_gpu_inplace and to_cpu_inplace to make _inplace use consistent. --- src/benches/benchmark.rs | 36 ++++++++--- src/tensor.rs | 116 +++++++++++++++++++++++++++-------- src/tensor_opencl_support.rs | 22 +++++-- src/transformer.rs | 32 +++++----- 4 files changed, 151 insertions(+), 55 deletions(-) diff --git a/src/benches/benchmark.rs b/src/benches/benchmark.rs index aba10fc..9917f25 100644 --- a/src/benches/benchmark.rs +++ b/src/benches/benchmark.rs @@ -14,16 +14,34 @@ pub fn opencl_benchmarks(c: &mut Criterion) { let cl = OpenCL::new(false, 0).unwrap(); let mut mul_left = Tensor::random(1024, 1024, TensorDType::Float16); - mul_left.to_gpu(&cl).unwrap(); + mul_left.to_gpu_inplace(&cl).unwrap(); let mut mul_right = Tensor::random(1024, 1024, TensorDType::Float16); - mul_right.to_gpu(&cl).unwrap(); + mul_right.to_gpu_inplace(&cl).unwrap(); let mut mul_target = Tensor::zeros(1024, 1024, TensorDType::Float16); - mul_target.to_gpu(&cl).unwrap(); + mul_target.to_gpu_inplace(&cl).unwrap(); let mut mul_left_cpu = Tensor::random(1024, 1024, TensorDType::Float32); let mut mul_right_cpu = Tensor::random(1024, 1024, TensorDType::Float32); let mut mul_target_cpu = Tensor::random(1024, 1024, TensorDType::Float32); + let mut mul_left1 = Tensor::random(4096, 11000, TensorDType::Float16); + let mut mul_right1 = Tensor::random(1, 11000, TensorDType::Float16); + let mut mul_target1 = Tensor::zeros(4096, 1, TensorDType::Float16); + mul_left1.to_gpu_inplace(&cl).unwrap(); + mul_right1.to_gpu_inplace(&cl).unwrap(); + mul_target1.to_gpu_inplace(&cl).unwrap(); + + c.bench_function( + "4096x11000 to 1x11000 matrix multiplication transposed on OpenCL", + |b| { + b.iter(|| { + mul_target1 + .matrix_mul_inplace_transposed(black_box(&mul_left1), black_box(&mul_right1)); + mul_target1.finish(); + }) + }, + ); + c.bench_function( "1024x1024 matrix multiplication transposed on OpenCL", |b| { @@ -43,24 +61,24 @@ pub fn opencl_benchmarks(c: &mut Criterion) { c.bench_function("1x1 matrix from CPU to OpenCL device and back", |b| { b.iter(|| { - let _ = orig1.to_gpu(&cl).unwrap(); - let _ = orig1.to_cpu(); + let _ = orig1.to_gpu_inplace(&cl).unwrap(); + let _ = orig1.to_cpu_inplace(); orig1.finish(); }) }); c.bench_function("1024x1024 matrix from CPU to OpenCL device and back", |b| { b.iter(|| { - let _ = orig16.to_gpu(&cl).unwrap(); - let _ = orig16.to_cpu(); + let _ = orig16.to_gpu_inplace(&cl).unwrap(); + let _ = orig16.to_cpu_inplace(); orig16.finish(); }) }); c.bench_function("4096x4096 matrix from CPU to OpenCL device and back", |b| { b.iter(|| { - let _ = orig32.to_gpu(&cl).unwrap(); - let _ = orig32.to_cpu(); + let _ = orig32.to_gpu_inplace(&cl).unwrap(); + let _ = orig32.to_cpu_inplace(); orig32.finish(); }) }); diff --git a/src/tensor.rs b/src/tensor.rs index a65d69b..740e60a 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -1,3 +1,22 @@ +/* + * + * Tensors for RLLaMA + * + * This is not a general Tensor library; but it has just enough to run the transformers in LLaMA + * model. + * + * + * The main structure you work with is Tensor, which is a 2D matrix. All Tensors here are 2D + * matrices with no flexibility. + * + * Tensors can be 16-bit, 32-bit and they can be on OpenCL or on the CPU. + * + * Operations have this naming convention: + * + * If it's "to_XXX", then it returns a new tensor in the specified format. 
+ * If it's "XXX_inplace", then it has a &mut self and it modifies the tensor in place. + */ + #[cfg(feature = "opencl")] use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLEvent, OpenCLTensor}; use crate::unpickler; @@ -587,7 +606,7 @@ impl Tensor { // TODO: do not create a CPU-side copy let result = unsafe { Tensor::uninitialized(self.rows, self.cols, self.dtype) }; let mut result = result.to_f16(); - result.to_gpu(&cl).unwrap(); + result.to_gpu_inplace(&cl).unwrap(); result.with_opencl_data_mut(|tgt_tensor| { tgt_tensor.copy_inplace(self_tensor).unwrap(); other.with_opencl_data(|other_tensor| { @@ -689,7 +708,7 @@ impl Tensor { // TODO: don't generate a CPU-side copy, create the result directly on OpenCL side let mut result = unsafe { Tensor::uninitialized(self.rows, self.cols, self.dtype) }; result = result.to_f16(); - result.to_gpu(&cl).unwrap(); + result.to_gpu_inplace(&cl).unwrap(); result.with_opencl_data_mut(|tgt_tensor| { tgt_tensor.copy_inplace(src_tensor).unwrap(); tgt_tensor.silu_inplace().unwrap(); @@ -733,7 +752,7 @@ impl Tensor { // TODO: don't generate a CPU-side copy, create the result directly on OpenCL side let mut result = unsafe { Tensor::uninitialized(self.cols, self.rows, self.dtype) }; result = result.to_f16(); - result.to_gpu(&cl).unwrap(); + result.to_gpu_inplace(&cl).unwrap(); result.with_opencl_data_mut(|tgt_tensor| { tgt_tensor.transpose_from(src_tensor).unwrap(); }); @@ -817,7 +836,7 @@ impl Tensor { #[cfg(feature = "opencl")] if self.is_on_gpu() { let od = self.opencl_data.write().unwrap(); - result.to_gpu(&od.as_ref().unwrap().cl()).unwrap(); + result.to_gpu_inplace(&od.as_ref().unwrap().cl()).unwrap(); } result.matrix_mul_inplace_transposed(self, other); @@ -1427,14 +1446,14 @@ impl Tensor { /// /// The tensor is moved asynchronously. #[cfg(feature = "opencl")] - pub fn to_gpu(&mut self, cl: &OpenCL) -> Result<(), TensorError> { + pub fn to_gpu_inplace(&mut self, cl: &OpenCL) -> Result<(), TensorError> { self.process_waiting_for_data_mut(); let mut od = self.opencl_data.write().unwrap(); if od.is_some() { return Ok(()); } if self.dtype != TensorDType::Float16 { - panic!("to_gpu: Only float16 tensors are supported on the GPU"); + panic!("to_gpu_inplace: Only float16 tensors are supported on the GPU"); } let cl_tensor = cl.data_u16_to_gpu( self.data as *const u16, @@ -1481,7 +1500,7 @@ impl Tensor { /// Sends a tensor from the GPU to the CPU. This is a no-op if the tensor is already on the /// CPU. 
#[cfg(feature = "opencl")] - pub fn to_cpu(&mut self) -> Result<(), TensorError> { + pub fn to_cpu_inplace(&mut self) -> Result<(), TensorError> { self.process_waiting_for_data_mut(); let mut od = self.opencl_data.write().unwrap(); if od.is_none() { @@ -1489,7 +1508,7 @@ impl Tensor { } let data = unsafe { std::alloc::alloc(self.layout) }; if data.is_null() { - panic!("to_cpu: Failed to allocate tensor"); + panic!("to_cpu_inplace: Failed to allocate tensor"); } let ev = od.as_mut().unwrap().data_u16_from_gpu(data as *mut u16)?; self.data = data as *mut u16 as *mut u8; @@ -2162,12 +2181,12 @@ mod tests { let mut b2 = b.to_f16(); let mut c = Tensor::random(512, 768, TensorDType::Float32); let mut c2 = Tensor::zeros(512, 768, TensorDType::Float32).to_f16(); - a2.to_gpu(&cl).unwrap(); - b2.to_gpu(&cl).unwrap(); - c2.to_gpu(&cl).unwrap(); + a2.to_gpu_inplace(&cl).unwrap(); + b2.to_gpu_inplace(&cl).unwrap(); + c2.to_gpu_inplace(&cl).unwrap(); c.matrix_mul_inplace_transposed(&a, &b); c2.matrix_mul_inplace_transposed(&a2, &b2); - c2.to_cpu().unwrap(); + c2.to_cpu_inplace().unwrap(); assert_eq!(c.rows(), c2.rows()); assert_eq!(c.cols(), c2.cols()); @@ -2189,12 +2208,12 @@ mod tests { let mut b2 = b.to_f16(); let mut c = Tensor::random(1024, 1024, TensorDType::Float32); let mut c2 = Tensor::zeros(1024, 1024, TensorDType::Float32).to_f16(); - a2.to_gpu(&cl).unwrap(); - b2.to_gpu(&cl).unwrap(); - c2.to_gpu(&cl).unwrap(); + a2.to_gpu_inplace(&cl).unwrap(); + b2.to_gpu_inplace(&cl).unwrap(); + c2.to_gpu_inplace(&cl).unwrap(); c.matrix_mul_inplace_transposed(&a, &b); c2.matrix_mul_inplace_transposed(&a2, &b2); - c2.to_cpu().unwrap(); + c2.to_cpu_inplace().unwrap(); assert_eq!(c.rows(), c2.rows()); assert_eq!(c.cols(), c2.cols()); @@ -2218,11 +2237,11 @@ mod tests { let mat1 = Tensor::random(a, b, TensorDType::Float16); let mat2 = mat1.clone(); let mut mat2 = mat2.to_f16(); - mat2.to_gpu(&cl).unwrap(); + mat2.to_gpu_inplace(&cl).unwrap(); let mat1_result = mat1.silu(); let mut mat2_result = mat2.silu(); - mat2_result.to_cpu().unwrap(); + mat2_result.to_cpu_inplace().unwrap(); assert_eq!(mat1_result.rows(), mat2_result.rows()); assert_eq!(mat1_result.cols(), mat2_result.cols()); @@ -2253,12 +2272,12 @@ mod tests { let mut mat1_gpu = mat1.to_f16(); let mut mat2_gpu = mat2.to_f16(); - mat1_gpu.to_gpu(&cl).unwrap(); - mat2_gpu.to_gpu(&cl).unwrap(); + mat1_gpu.to_gpu_inplace(&cl).unwrap(); + mat2_gpu.to_gpu_inplace(&cl).unwrap(); let result1 = mat1.hadamard_product(&mat2); let mut result2 = mat1_gpu.hadamard_product(&mat2_gpu); - result2.to_cpu().unwrap(); + result2.to_cpu_inplace().unwrap(); assert_eq!(result1.rows(), result2.rows()); assert_eq!(result1.cols(), result2.cols()); @@ -2285,11 +2304,11 @@ mod tests { let b = rng.gen_range(1..=100); let mat1 = Tensor::random(a, b, TensorDType::Float16); let mut mat1_gpu = mat1.to_f16(); - mat1_gpu.to_gpu(&cl).unwrap(); + mat1_gpu.to_gpu_inplace(&cl).unwrap(); let mat1_transposed = mat1.transpose(); let mut mat1_gpu_transposed = mat1_gpu.transpose(); - mat1_gpu_transposed.to_cpu().unwrap(); + mat1_gpu_transposed.to_cpu_inplace().unwrap(); assert_eq!(mat1_transposed.rows(), mat1_gpu_transposed.rows()); assert_eq!(mat1_transposed.cols(), mat1_gpu_transposed.cols()); @@ -2323,9 +2342,52 @@ mod tests { let mut mat1_gpu = mat1.clone(); let mut mat2_gpu = mat2.clone(); let mut mat3_gpu = mat3.clone(); - mat1_gpu.to_gpu(&cl).unwrap(); - mat2_gpu.to_gpu(&cl).unwrap(); - mat3_gpu.to_gpu(&cl).unwrap(); + mat1_gpu.to_gpu_inplace(&cl).unwrap(); + 
mat2_gpu.to_gpu_inplace(&cl).unwrap(); + mat3_gpu.to_gpu_inplace(&cl).unwrap(); + + let mat1 = mat1.to_f32(); + let mat2 = mat2.to_f32(); + let mut mat3 = mat3.to_f32(); + + mat3.matrix_mul_inplace_transposed(&mat1, &mat2); + mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu); + mat3_gpu.to_cpu_inplace().unwrap(); + + assert_eq!(mat3.rows(), mat3_gpu.rows()); + assert_eq!(mat3.cols(), mat3_gpu.cols()); + + for row in 0..mat3.rows { + for col in 0..mat3.cols { + assert_relative_eq!( + mat3.get_f32(row, col), + mat3_gpu.get_f32(row, col), + epsilon = 1e-2, + ); + } + } + } + } + + #[cfg(feature = "opencl")] + #[test] + fn gpu_matrix_mul_vector_transposed_is_close_to_cpu_matrix_mul_vector_transposed() { + let cl = OpenCL::new(false, 0).unwrap(); + let mut rng = rand::thread_rng(); + + for _trial in 0..300 { + let a = rng.gen_range(1..=300); + let b = rng.gen_range(1..=300); + + let mat1 = Tensor::random(a, b, TensorDType::Float16); + let mat2 = Tensor::random(1, b, TensorDType::Float16); + let mat3 = Tensor::random(a, 1, TensorDType::Float16); + let mut mat1_gpu = mat1.clone(); + let mut mat2_gpu = mat2.clone(); + let mut mat3_gpu = mat3.clone(); + mat1_gpu.to_gpu_inplace(&cl).unwrap(); + mat2_gpu.to_gpu_inplace(&cl).unwrap(); + mat3_gpu.to_gpu_inplace(&cl).unwrap(); let mat1 = mat1.to_f32(); let mat2 = mat2.to_f32(); @@ -2333,7 +2395,7 @@ mod tests { mat3.matrix_mul_inplace_transposed(&mat1, &mat2); mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu); - mat3_gpu.to_cpu().unwrap(); + mat3_gpu.to_cpu_inplace().unwrap(); assert_eq!(mat3.rows(), mat3_gpu.rows()); assert_eq!(mat3.cols(), mat3_gpu.cols()); diff --git a/src/tensor_opencl_support.rs b/src/tensor_opencl_support.rs index aa81f05..c540ee7 100644 --- a/src/tensor_opencl_support.rs +++ b/src/tensor_opencl_support.rs @@ -314,12 +314,19 @@ impl OpenCLTensor { ); } - // Clear out the target memory + // Clear out the target memory. unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? 
}; let prg = self.cl.programs.write().unwrap(); - let prg = if self.cl.is_cpu_device { + // 0 = CPU optimized + // 1 = GPU optimized + // 2 = GPU optimized vector multiply (other.rows == 1) + const CPU: u8 = 0; + const GPU: u8 = 1; + let strategy: u8 = if self.cl.is_cpu_device { CPU } else { GPU }; + + let prg = if strategy == CPU { &prg.matrix_mul_transposed_f16_cpu_optimized } else { &prg.matrix_mul_transposed_f16 @@ -347,14 +354,14 @@ impl OpenCLTensor { }; unsafe { - if self.cl.is_cpu_device { + if strategy == CPU { let b = prg .cmd() .queue(&self.queue) .global_work_size([self.cols as usize, self.rows as usize]) .enew(&mut event); b.enq()?; - } else { + } else if strategy == GPU { let b = prg .cmd() .queue(&self.queue) @@ -362,6 +369,13 @@ impl OpenCLTensor { .local_work_size([16, 16]) .enew(&mut event); b.enq()?; + } else { + let b = prg + .cmd() + .queue(&self.queue) + .global_work_size([self.cols as usize, self.rows as usize]) + .enew(&mut event); + b.enq()?; } } self.last_event = Some(event.clone()); diff --git a/src/transformer.rs b/src/transformer.rs index 4d158b8..a20da45 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -407,9 +407,9 @@ impl FeedForward { w2 = w2.to_f16(); w3 = w3.to_f16(); let ds = data_settings.clone(); - w1.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap(); - w2.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap(); - w3.to_gpu(&ds.cl.unwrap()).unwrap(); + w1.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap(); + w2.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap(); + w3.to_gpu_inplace(&ds.cl.unwrap()).unwrap(); } } #[cfg(not(feature = "opencl"))] @@ -435,7 +435,8 @@ impl FeedForward { x_was_on_cpu = x.is_on_cpu(); if self.data_settings.use_opencl_for_feedforward { *x = x.to_f16(); - x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap(); + x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap()) + .unwrap(); } } let (w1_out, w3_out) = rayon::join( @@ -457,7 +458,7 @@ impl FeedForward { { let mut result = self.w2.matrix_mul_transposed(&w1w3_out); if x_was_on_cpu { - result.to_cpu().unwrap(); + result.to_cpu_inplace().unwrap(); result } else { result @@ -510,10 +511,10 @@ impl Attention { wv = wv.to_f16(); wo = wo.to_f16(); let ds = data_settings.clone(); - wq.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap(); - wk.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap(); - wv.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap(); - wo.to_gpu(&ds.cl.unwrap()).unwrap(); + wq.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap(); + wk.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap(); + wv.to_gpu_inplace(&ds.cl.as_ref().unwrap().clone()).unwrap(); + wo.to_gpu_inplace(&ds.cl.unwrap()).unwrap(); } } #[cfg(not(feature = "opencl"))] @@ -550,7 +551,8 @@ impl Attention { x_was_on_cpu = x.is_on_cpu(); if self.data_settings.use_opencl_for_attention { *x = x.to_f16(); - x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap(); + x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap()) + .unwrap(); } } @@ -560,9 +562,9 @@ impl Attention { let mut xq_out = x.matrix_mul_transposed(&self.wq); let mut xk_out = x.matrix_mul_transposed(&self.wk); let mut xv_out = x.matrix_mul_transposed(&self.wv); - xq_out.to_cpu().unwrap(); - xk_out.to_cpu().unwrap(); - xv_out.to_cpu().unwrap(); + xq_out.to_cpu_inplace().unwrap(); + xk_out.to_cpu_inplace().unwrap(); + xv_out.to_cpu_inplace().unwrap(); (xq_out.to_f32(), xk_out.to_f32(), xv_out.to_f32()) }; @@ -673,10 +675,10 @@ impl Attention { .to_f16(); if self.wo.is_on_gpu() { xq_row - 
.to_gpu(&self.data_settings.cl.as_ref().unwrap()) + .to_gpu_inplace(&self.data_settings.cl.as_ref().unwrap()) .unwrap(); let mut result = xq_row.matrix_mul_transposed(&self.wo); - result.to_cpu().unwrap(); + result.to_cpu_inplace().unwrap(); result.to_f32() } else { xq_row.matrix_mul_transposed(&self.wo)
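
As a usage sketch of the convention this patch makes consistent (to_XXX returns a new tensor in the requested format; XXX_inplace takes &mut self and converts the existing tensor in place), the snippet below mirrors the benchmark and test calls visible in the diff. It is not part of the patch: the use paths are an assumption about the crate layout, and error handling is left as unwrap(), as in the benchmarks.

    // Minimal sketch of the renamed in-place API, based on the calls shown in this patch.
    // The module paths below are assumptions inferred from the file layout above.
    use rllama::tensor::{Tensor, TensorDType};
    use rllama::tensor_opencl_support::OpenCL;

    fn gpu_roundtrip_sketch() {
        let cl = OpenCL::new(false, 0).unwrap();

        // Only Float16 tensors may live on the GPU; to_gpu_inplace panics otherwise.
        let mut a = Tensor::random(1024, 1024, TensorDType::Float16);
        let mut b = Tensor::random(1024, 1024, TensorDType::Float16);
        let mut c = Tensor::zeros(1024, 1024, TensorDType::Float16);

        // *_inplace operations take &mut self and modify the existing tensor.
        a.to_gpu_inplace(&cl).unwrap();
        b.to_gpu_inplace(&cl).unwrap();
        c.to_gpu_inplace(&cl).unwrap();

        // The multiply is queued asynchronously; finish() waits for the OpenCL work.
        c.matrix_mul_inplace_transposed(&a, &b);
        c.finish();

        // Bring the result back to the CPU (again in place), then convert:
        // to_f32 returns a new Float32 tensor rather than mutating.
        c.to_cpu_inplace().unwrap();
        let _c_f32 = c.to_f32();
    }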