From 1a88482988466bdf805b88a1281f0dc92e1de047 Mon Sep 17 00:00:00 2001
From: Mikko Juola
Date: Mon, 13 Mar 2023 12:33:21 -0700
Subject: [PATCH] Add some OpenCL bits.

I wrote an OpenCL matrix_mul_inplace_transposed. On GPU it is much
faster than my CPU implementation, and even on CPU (OpenCL runs on both
CPU and GPU) it is quite a lot faster than my own code. Basically it
can destroy all of my crappy code, so I think I will be replacing some
of my other operations with this stuff in the near future.
---
 README.md                    |  12 +-
 src/benches/benchmark.rs     |  34 ++++-
 src/rllama_main.rs           |   3 +-
 src/tensor.rs                | 136 ++++++++++++++++--
 src/tensor_opencl_support.rs | 260 +++++++++++++++++++++++++++++++----
 5 files changed, 408 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index 7945a44..f711b49 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,9 @@ PyTorch. Well almost, it doesn't unzip them automatically (see below).
 
 # How to run
 
-You will need Rust. Make sure you can run `cargo` from a command line.
+You will need Rust. Make sure you can run `cargo` from a command line. In
+particular, this project uses unstable features, so you need nightly Rust.
+Check that `cargo --version` reports a nightly toolchain.
 
 You will need to download LLaMA-7B weights. Refer to
 https://github.com/facebookresearch/llama/
@@ -58,4 +60,10 @@ settings.
 This is a hobby thing for me so don't expect updates or help.
 
 * Some other CPU implementations use quantization to reduce the size of weights
-* Put some of the operations on the OpenCL GPU
+* Put some of the operations on the OpenCL GPU/CPU. I've made some initial
+  OpenCL code but it is not used in the transformer loop yet. The OpenCL CPU
+  path is roughly twice as fast as my own AVX2 code, and much faster on GPU,
+  although still about 20x slower than the equivalent PyTorch operation there.
+* I've heard there is something called Tensor Cores on NVIDIA GPUs. They are
+  not accessible with OpenCL, but might be accessible on Vulkan with an
+  extension.
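As a rough usage sketch (not part of the diff; the `rllama::` module paths are
guessed from the file layout and may not match the crate exactly), the pieces
added below fit together like this:

```rust
use rllama::tensor::{Tensor, TensorDType};
use rllama::tensor_opencl_support::OpenCL;

fn main() {
    // Open device 0 quietly (pass true to print the available OpenCL devices).
    let cl = OpenCL::new(false, 0).unwrap();

    // Tensors start on the CPU; to_gpu() moves them to the OpenCL device.
    let mut a = Tensor::random(1024, 1024, TensorDType::Float16);
    let mut b = Tensor::random(1024, 1024, TensorDType::Float16);
    let mut c = Tensor::zeros(1024, 1024, TensorDType::Float16);
    a.to_gpu(&cl).unwrap();
    b.to_gpu(&cl).unwrap();
    c.to_gpu(&cl).unwrap();

    // c = a * transpose(b); this dispatches to the OpenCL kernel because
    // all three tensors are on the GPU.
    c.matrix_mul_inplace_transposed(&a, &b);

    // Everything is asynchronous: to_cpu() enqueues the transfer back to
    // host memory and finish() blocks until all pending work is done.
    c.to_cpu().unwrap();
    c.finish();
}
```

If any of the three tensors is not on the GPU, the call falls back to the
existing CPU implementation, as the dispatch code in src/tensor.rs below shows.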
diff --git a/src/benches/benchmark.rs b/src/benches/benchmark.rs
index b6a68eb..aba10fc 100644
--- a/src/benches/benchmark.rs
+++ b/src/benches/benchmark.rs
@@ -13,11 +13,39 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
     let mut orig32 = Tensor::random(4096, 4096, TensorDType::Float16);
     let cl = OpenCL::new(false, 0).unwrap();
 
+    let mut mul_left = Tensor::random(1024, 1024, TensorDType::Float16);
+    mul_left.to_gpu(&cl).unwrap();
+    let mut mul_right = Tensor::random(1024, 1024, TensorDType::Float16);
+    mul_right.to_gpu(&cl).unwrap();
+    let mut mul_target = Tensor::zeros(1024, 1024, TensorDType::Float16);
+    mul_target.to_gpu(&cl).unwrap();
+
+    let mut mul_left_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
+    let mut mul_right_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
+    let mut mul_target_cpu = Tensor::random(1024, 1024, TensorDType::Float32);
+
+    c.bench_function(
+        "1024x1024 matrix multiplication transposed on OpenCL",
+        |b| {
+            b.iter(|| {
+                mul_target
+                    .matrix_mul_inplace_transposed(black_box(&mul_left), black_box(&mul_right));
+                mul_target.finish();
+            })
+        },
+    );
+
+    c.bench_function("1024x1024 matrix multiplication transposed on CPU", |b| {
+        b.iter(|| {
+            let _ = mul_target_cpu.matrix_mul_inplace_transposed(&mul_left_cpu, &mul_right_cpu);
+        })
+    });
+
     c.bench_function("1x1 matrix from CPU to OpenCL device and back", |b| {
         b.iter(|| {
             let _ = orig1.to_gpu(&cl).unwrap();
             let _ = orig1.to_cpu();
-            orig1.process_waiting_for_data();
+            orig1.finish();
         })
     });
 
@@ -25,7 +53,7 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
         b.iter(|| {
             let _ = orig16.to_gpu(&cl).unwrap();
             let _ = orig16.to_cpu();
-            orig16.process_waiting_for_data();
+            orig16.finish();
         })
     });
 
@@ -33,7 +61,7 @@ pub fn opencl_benchmarks(c: &mut Criterion) {
         b.iter(|| {
             let _ = orig32.to_gpu(&cl).unwrap();
             let _ = orig32.to_cpu();
-            orig32.process_waiting_for_data();
+            orig32.finish();
         })
     });
 }
diff --git a/src/rllama_main.rs b/src/rllama_main.rs
index c9863fd..e6c14c4 100644
--- a/src/rllama_main.rs
+++ b/src/rllama_main.rs
@@ -63,11 +63,12 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
 
     #[cfg(feature = "opencl")]
-    let opencl: Option<OpenCL> = {
+    let _opencl: Option<OpenCL> = {
        let opencl_device = cli.opencl_device.unwrap_or(0);
        match OpenCL::new(!be_quiet, opencl_device) {
            Err(openclerr) => {
                eprintln!("OpenCL error: {}", openclerr);
+               eprintln!("OpenCL is disabled because it failed to initialize.");
                None
            }
            Ok(opencl) => {
diff --git a/src/tensor.rs b/src/tensor.rs
index 3d7d9e9..de6306f 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -827,9 +827,45 @@ impl Tensor {
         }
     }
 
+    #[cfg(feature = "opencl")]
+    pub fn is_on_gpu(&self) -> bool {
+        if self.waiting_for_data.is_some() {
+            return false;
+        }
+        let od = self.opencl_data.read().unwrap();
+        if od.is_some() {
+            return true;
+        }
+        false
+    }
+
+    #[cfg(feature = "opencl")]
+    fn matrix_mul_inplace_transposed_gpu(&mut self, src: &Tensor, other: &Tensor) {
+        let mut self_od = self.opencl_data.write().unwrap();
+        let src_od = src.opencl_data.read().unwrap();
+        let other_od = other.opencl_data.read().unwrap();
+        let self_od: &mut OpenCLTensor = self_od.as_mut().unwrap();
+        let src_od: &OpenCLTensor = src_od.as_ref().unwrap();
+        let other_od: &OpenCLTensor = other_od.as_ref().unwrap();
+
+        // TODO: if this fails, we panic. Think about if this is alright. I think for now it's
+        // alright.
+        self_od
+            .matrix_mul_inplace_transposed(src_od, other_od)
+            .unwrap();
+        std::mem::drop(self_od);
+        std::mem::drop(src_od);
+        std::mem::drop(other_od);
+    }
+
     /// Matrix multiplication done in-place, but the second matrix is transposed.
     /// With this, you can avoid using .transpose() on the second matrix.
     pub fn matrix_mul_inplace_transposed(&mut self, src: &Tensor, other: &Tensor) {
+        #[cfg(feature = "opencl")]
+        if self.is_on_gpu() && src.is_on_gpu() && other.is_on_gpu() {
+            self.matrix_mul_inplace_transposed_gpu(src, other);
+            return;
+        }
         self.assume_on_cpu();
         src.assume_on_cpu();
         other.assume_on_cpu();
@@ -1256,6 +1292,9 @@ impl Tensor {
             self.data as *const u16,
             self.layout,
             (self.rows * self.capacity_cols) as usize,
+            self.rows,
+            self.cols,
+            self.capacity_cols,
         )?;
         self.data = std::ptr::null_mut();
         *od = Some(cl_tensor);
@@ -1263,7 +1302,7 @@ impl Tensor {
     }
 
     #[cfg(feature = "opencl")]
-    pub fn process_waiting_for_data_mut(&mut self) {
+    fn process_waiting_for_data_mut(&mut self) {
         if let Some(ref wfd) = self.waiting_for_data {
             wfd.wait();
             let mut od = self.opencl_data.write().unwrap();
@@ -1273,7 +1312,7 @@ impl Tensor {
     }
 
     #[cfg(feature = "opencl")]
-    pub fn process_waiting_for_data(&self) {
+    fn process_waiting_for_data(&self) {
         if let Some(ref wfd) = self.waiting_for_data {
             wfd.wait();
             let mut od = self.opencl_data.write().unwrap();
@@ -1281,12 +1320,22 @@ impl Tensor {
         }
     }
 
+    /// Waits until all asynchronous operations on this tensor are done.
+    #[cfg(feature = "opencl")]
+    pub fn finish(&mut self) {
+        self.process_waiting_for_data_mut();
+        let mut od = self.opencl_data.write().unwrap();
+        if od.is_some() {
+            od.as_mut().unwrap().wait_until_ready();
+        }
+    }
+
     /// Sends a tensor from the GPU to the CPU. This is a no-op if the tensor is already on the
     /// CPU.
     #[cfg(feature = "opencl")]
     pub fn to_cpu(&mut self) -> Result<(), TensorError> {
         self.process_waiting_for_data_mut();
-        let od = self.opencl_data.read().unwrap();
+        let mut od = self.opencl_data.write().unwrap();
         if od.is_none() {
             return Ok(());
         }
@@ -1294,7 +1343,7 @@ impl Tensor {
         if data.is_null() {
             panic!("to_cpu: Failed to allocate tensor");
         }
-        let ev = od.as_ref().unwrap().data_u16_from_gpu(data as *mut u16)?;
+        let ev = od.as_mut().unwrap().data_u16_from_gpu(data as *mut u16)?;
         self.data = data as *mut u16 as *mut u8;
         self.waiting_for_data = Some(ev);
         Ok(())
@@ -1678,10 +1727,10 @@ mod tests {
         let mut rng = rand::thread_rng();
 
         for _ in 0..1000 {
-            let mut a: i64 = 0;
-            let mut b: i64 = 0;
-            let mut c: i64 = 0;
-            let mut d: i64 = 0;
+            let mut a: i64;
+            let mut b: i64;
+            let mut c: i64;
+            let d: i64;
             loop {
                 a = rng.gen_range(8..64);
                 b = rng.gen_range(8..64);
@@ -1954,4 +2003,75 @@ mod tests {
             }
         }
     }
+
+    #[cfg(feature = "opencl")]
+    #[test]
+    fn gpu_matrix_mul_transposed_is_close_to_cpu_matrix_mul_transposed_1024x1024() {
+        let cl = OpenCL::new(false, 0).unwrap();
+        let a = Tensor::random(1024, 1024, TensorDType::Float32);
+        let b = Tensor::random(1024, 1024, TensorDType::Float32);
+        let mut a2 = a.to_f16();
+        let mut b2 = b.to_f16();
+        let mut c = Tensor::random(1024, 1024, TensorDType::Float32);
+        let mut c2 = Tensor::zeros(1024, 1024, TensorDType::Float32).to_f16();
+        a2.to_gpu(&cl).unwrap();
+        b2.to_gpu(&cl).unwrap();
+        c2.to_gpu(&cl).unwrap();
+        c.matrix_mul_inplace_transposed(&a, &b);
+        c2.matrix_mul_inplace_transposed(&a2, &b2);
+        c2.to_cpu().unwrap();
+
+        assert_eq!(c.rows(), c2.rows());
+        assert_eq!(c.cols(), c2.cols());
+
+        for row in 0..c.rows {
+            for col in 0..c.cols {
+                assert_relative_eq!(c.get_f32(row, col), c2.get_f32(row, col), epsilon = 1e-1);
+            }
+        }
+    }
+
+    #[cfg(feature = "opencl")]
+    #[test]
+    fn gpu_matrix_mul_transposed_is_close_to_cpu_matrix_mul_transposed() {
+        let cl = OpenCL::new(false, 0).unwrap();
+        let mut rng = rand::thread_rng();
+
+        for _trial in 0..300 {
+            let a = rng.gen_range(1..=300);
+            let b = rng.gen_range(1..=300);
+            let c = rng.gen_range(1..=300);
+
+            let mat1 = Tensor::random(a, b, TensorDType::Float16);
+            let mat2 = Tensor::random(c, b, TensorDType::Float16);
+            let mat3 = Tensor::random(a, c, TensorDType::Float16);
+            let mut mat1_gpu = mat1.clone();
+            let mut mat2_gpu = mat2.clone();
+            let mut mat3_gpu = mat3.clone();
+            mat1_gpu.to_gpu(&cl).unwrap();
+            mat2_gpu.to_gpu(&cl).unwrap();
+            mat3_gpu.to_gpu(&cl).unwrap();
+
+            let mat1 = mat1.to_f32();
+            let mat2 = mat2.to_f32();
+            let mut mat3 = mat3.to_f32();
+
+            mat3.matrix_mul_inplace_transposed(&mat1, &mat2);
+            mat3_gpu.matrix_mul_inplace_transposed(&mat1_gpu, &mat2_gpu);
+            mat3_gpu.to_cpu().unwrap();
+
+            assert_eq!(mat3.rows(), mat3_gpu.rows());
+            assert_eq!(mat3.cols(), mat3_gpu.cols());
+
+            for row in 0..mat3.rows {
+                for col in 0..mat3.cols {
+                    assert_relative_eq!(
+                        mat3.get_f32(row, col),
+                        mat3_gpu.get_f32(row, col),
+                        epsilon = 1e-2,
+                    );
+                }
+            }
+        }
+    }
 }
diff --git a/src/tensor_opencl_support.rs b/src/tensor_opencl_support.rs
index 25f66b1..7f571ab 100644
--- a/src/tensor_opencl_support.rs
+++ b/src/tensor_opencl_support.rs
@@ -2,23 +2,39 @@
  * OpenCL stuff to run (some) of the tensor operations.
  */
 
-use ocl::{Buffer, Context, Device, Event, Platform, Queue};
+use ocl::{Buffer, Context, Device, Event, Kernel, Platform, Program, Queue};
 use std::alloc::Layout;
+use std::sync::{Arc, RwLock};
 use thiserror::Error;
 
 #[derive(Debug)]
+#[allow(dead_code)]
+struct Programs {
+    matrix_mul_transposed_by_row_f16_program: Program,
+    matrix_mul_transposed_by_row_f16: Kernel,
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
 pub struct OpenCL {
     ctx: Context,
     queue: Queue,
+    programs: Arc<RwLock<Programs>>,
 }
 
 #[derive(Debug)]
 pub struct OpenCLTensor {
-    buf: Buffer<u16>,           // really is f16
-    write_event: Option<Event>, // if Some, the buffer is being written to
-    data: *const u16,           // if non-null, is host pointer that should be freed
+    buf: Buffer<u16>,                   // really is f16
+    initial_write_event: Option<Event>, // if Some, the host-to-GPU write is still in flight
+    last_event: Option<Event>,          // the most recently enqueued command, if any
+    data: *const u16,   // if non-null, host pointer to free once the initial write is done
     data_layout: Layout,
     nitems: usize,
+    rows: i64,
+    cols: i64,
+    cols_capacity: i64, // row stride, including zero padding
+    queue: Queue,
+    programs: Arc<RwLock<Programs>>,
 }
 
 #[derive(Debug)]
@@ -28,10 +44,15 @@ pub struct OpenCLEvent {
 
 impl Drop for OpenCLTensor {
     fn drop(&mut self) {
+        if self.initial_write_event.is_some() {
+            self.initial_write_event
+                .as_ref()
+                .unwrap()
+                .wait_for()
+                .unwrap();
+        }
+        self.initial_write_event = None;
         if !self.data.is_null() {
-            if self.write_event.is_some() {
-                self.write_event.as_ref().unwrap().wait_for().unwrap();
-            }
             unsafe {
                 std::alloc::dealloc(self.data as *mut u8, self.data_layout);
             }
@@ -59,9 +80,9 @@ impl OpenCL {
         if verbose {
             println!("Enumerating OpenCL devices:");
         }
-        for (idx, (_, device)) in devices.iter().enumerate() {
+        for (idx, (_plat, device)) in devices.iter().enumerate() {
             if verbose {
-                println!("OpenCL {} device: {}", idx, device.name()?);
+                println!("OpenCL {} device: {}", idx, device.name()?,);
             }
         }
         if nth_device > devices.len() {
@@ -78,8 +99,12 @@ impl OpenCL {
             .build()?;
 
         let queue = Queue::new(&ctx, devices[nth_device].1, None)?;
-
-        Ok(OpenCL { ctx, queue })
+        let programs = make_programs(&ctx, &queue)?;
+        Ok(OpenCL {
+            ctx,
+            queue,
+            programs: Arc::new(RwLock::new(programs)),
+        })
     }
 
     pub fn flush(&self) {
@@ -91,6 +116,9 @@ impl OpenCL {
         data: *const u16,
         data_layout: Layout,
         nitems: usize,
+        rows: i64,
+        cols: i64,
+        cols_capacity: i64,
     ) -> Result<OpenCLTensor, OpenCLError> {
         unsafe {
             let buf = Buffer::builder()
@@ -106,10 +134,16 @@ impl OpenCL {
                 .enq()?;
             Ok(OpenCLTensor {
                 buf,
-                write_event: Some(event),
+                initial_write_event: Some(event),
+                last_event: None,
                 data,
                 data_layout,
                 nitems,
+                rows,
+                cols,
+                cols_capacity,
+                queue: self.queue.clone(),
+                programs: self.programs.clone(),
             })
         }
     }
@@ -117,34 +151,96 @@ impl OpenCL {
 
 impl OpenCLTensor {
     pub fn wait_until_ready(&mut self) {
-        if self.write_event.is_none() {
-            return;
+        if self.last_event.is_some() {
+            self.last_event.as_ref().unwrap().wait_for().unwrap();
+            self.last_event = None;
+        }
+        if self.initial_write_event.is_some() {
+            self.initial_write_event
+                .as_ref()
+                .unwrap()
+                .wait_for()
+                .unwrap();
+            self.initial_write_event = None;
         }
-        self.write_event.as_ref().unwrap().wait_for().unwrap();
-        self.write_event = None;
         if !self.data.is_null() {
-            if self.write_event.is_some() {
-                self.write_event.as_ref().unwrap().wait_for().unwrap();
-            }
             unsafe {
                 std::alloc::dealloc(self.data as *mut u8, self.data_layout);
            }
+            self.data = std::ptr::null();
         }
     }
 
-    pub fn data_u16_from_gpu(&self, data: *mut u16) -> Result<OpenCLEvent, OpenCLError> {
+    pub fn data_u16_from_gpu(&mut self, data: *mut u16) -> Result<OpenCLEvent, OpenCLError> {
         unsafe {
             let mut event = Event::empty();
             let data_slice: &mut [u16] = std::slice::from_raw_parts_mut(data, self.nitems);
-            self.buf
+            let b = self
+                .buf
                 .cmd()
                 .read(data_slice)
                 .block(false)
-                .enew(&mut event)
-                .enq()?;
+                .enew(&mut event);
+            b.enq()?;
+            self.last_event = Some(event.clone());
             return Ok(OpenCLEvent { event });
         }
     }
+
+    pub fn matrix_mul_inplace_transposed(
+        &mut self,
+        src: &OpenCLTensor,
+        other: &OpenCLTensor,
+    ) -> Result<OpenCLEvent, OpenCLError> {
+        if src.cols != other.cols {
+            panic!(
+                "OpenCL matrix_mul_inplace_transposed: src.cols must equal other.cols: {}x{} vs {}x{}",
+                src.rows, src.cols, other.rows, other.cols
+            );
+        }
+        if self.rows != src.rows || self.cols != other.rows {
+            panic!(
+                "OpenCL matrix_mul_inplace_transposed: self.rows must equal src.rows and self.cols must equal other.rows: {}x{} vs {}x{} vs {}x{}",
+                self.rows, self.cols, src.rows, src.cols, other.rows, other.cols
+            );
+        }
+
+        // Clear out the target memory
+        unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
+
+        let prg = self.programs.write().unwrap();
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(0, self.buf.clone())?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(1, src.buf.clone())?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(2, other.buf.clone())?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(3, src.cols_capacity as i32)?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(4, other.cols_capacity as i32)?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(5, self.cols_capacity as i32)?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(6, self.rows as i32)?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(7, self.cols as i32)?;
+        prg.matrix_mul_transposed_by_row_f16
+            .set_arg(8, src.cols as i32)?;
+        let mut event = Event::empty();
+
+        unsafe {
+            let b = prg
+                .matrix_mul_transposed_by_row_f16
+                .cmd()
+                .queue(&self.queue)
+                .global_work_size([self.rows as usize, self.cols_capacity as usize])
+                .enew(&mut event);
+            b.enq()?;
+        }
+        self.last_event = Some(event.clone());
+        Ok(OpenCLEvent { event })
+    }
 }
 
 impl OpenCLEvent {
@@ -153,3 +249,121 @@
         self.event.wait_for().unwrap();
     }
 }
+
+fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError> {
+    let mut last_err: Option<OpenCLError> = None;
+    // There used to be more sources here, but now there's just one. This loop goes through the
+    // candidate sources and accepts the first one that compiles.
+    for src in &[MATRIX_MUL_TRANSPOSED_BY_ROW_F16_SRC] {
+        fn make_programs_with_src(
+            ctx: &Context,
+            queue: &Queue,
+            src: &str,
+        ) -> Result<Programs, OpenCLError> {
+            let program = Program::builder().src(src).build(&ctx)?;
+            let kernel = Kernel::builder()
+                .program(&program)
+                .name("matrix_mul_transposed_by_row_f16")
+                .arg(None::<&Buffer<u16>>)
+                .arg(None::<&Buffer<u16>>)
+                .arg(None::<&Buffer<u16>>)
+                .arg(&0)
+                .arg(&0)
+                .arg(&0)
+                .arg(&0)
+                .arg(&0)
+                .arg(&0)
+                .queue(queue.clone())
+                .build()?;
+            Ok(Programs {
+                matrix_mul_transposed_by_row_f16_program: program,
+                matrix_mul_transposed_by_row_f16: kernel,
+            })
+        }
+        match make_programs_with_src(ctx, queue, src) {
+            Err(e) => {
+                last_err = Some(e);
+                continue;
+            }
+            Ok(p) => return Ok(p),
+        }
+    }
+    if last_err.is_none() {
+        unreachable!();
+    }
+    Err(last_err.unwrap())
+}
+
+const MATRIX_MUL_TRANSPOSED_BY_ROW_F16_SRC: &str = r#"
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+/*
+ * Matrix multiplication with a transposed second matrix, using 16-bit floats.
+ *
+ * One work item per element of the target matrix (a 2D range over rows and padded columns).
+ *
+ * Assumes that each row in the matrices is zero-padded so that there are 32 bytes (i.e. 16
+ * halves) of slack and we don't need to care if our loops go over the bounds.
+ *
+ * Operations are done in float32.
+ *
+ * This thing is not very fast right now.
+ * I compared with PyTorch and this is like 20x slower. It is still much faster than running on
+ * the CPU. I'm not sure if PyTorch uses cuBLAS, but if we could get to even 50% of its speed I
+ * would be happy.
+ *
+ * The OpenCL on CPU for a Ryzen 3950X seems to easily beat my own AVX2 operations.
+ *
+ * TODO: need to read resources like https://cnugteren.github.io/tutorial/pages/page1.html to
+ * figure out how to do matrix multiplication faster.
+ */
+__kernel void matrix_mul_transposed_by_row_f16(
+    __global half *tgt,
+    __global const half *left,
+    __global const half *right,
+    const int left_cols_capacity,
+    const int right_cols_capacity,
+    const int ncols_capacity,
+    const int nrows,
+    const int ncols, // size of target
+    const int shared_sz
+) {
+    int col_iterations = shared_sz / 16;
+    if (shared_sz % 16 != 0) {
+        col_iterations = col_iterations + 1;
+    }
+
+    const int tgt_row = get_global_id(0);
+    const int tgt_col = get_global_id(1);
+
+    float16 sum = 0;
+    for (int col16 = 0; col16 < col_iterations; col16++) {
+        const float16 left16 = vload_half16((tgt_row * left_cols_capacity)/16 + col16, (__global const half*) left);
+        const float16 right16 = vload_half16((tgt_col * right_cols_capacity)/16 + col16, (__global const half*) right);
+        // Elementwise multiply-accumulate into the 16-wide running sum.
+        // (Equivalent without FMA: sum += left16 * right16;)
+        sum = fma(left16, right16, sum);
+    }
+    // Reduce the 16 lanes pairwise, which is more accurate than a straight
+    // left-to-right summation.
+    float sum1 = sum.s0 + sum.s1;
+    float sum2 = sum.s2 + sum.s3;
+    float sum3 = sum.s4 + sum.s5;
+    float sum4 = sum.s6 + sum.s7;
+    float sum5 = sum.s8 + sum.s9;
+    float sum6 = sum.sa + sum.sb;
+    float sum7 = sum.sc + sum.sd;
+    float sum8 = sum.se + sum.sf;
+
+    float sum11 = sum1 + sum2;
+    float sum12 = sum3 + sum4;
+    float sum13 = sum5 + sum6;
+    float sum14 = sum7 + sum8;
+
+    float sum21 = sum11 + sum12;
+    float sum22 = sum13 + sum14;
+
+    float total = sum21 + sum22;
+
+    vstore_half(total, 0, (__global half*) &tgt[tgt_row * ncols_capacity + tgt_col]);
+}
+"#;
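For reference, here is a plain-Rust sketch (not part of the patch; the function
name is made up for illustration) of the computation the kernel above performs,
ignoring the f16 conversion and row padding:

```rust
/// tgt[r][c] = sum over k of left[r][k] * right[c][k], i.e. left * transpose(right).
fn matrix_mul_transposed_reference(left: &[Vec<f32>], right: &[Vec<f32>]) -> Vec<Vec<f32>> {
    let shared_sz = left[0].len();
    assert_eq!(shared_sz, right[0].len(), "left.cols must equal right.cols");
    let mut tgt = vec![vec![0.0f32; right.len()]; left.len()];
    for r in 0..left.len() {
        for c in 0..right.len() {
            let mut sum = 0.0f32;
            for k in 0..shared_sz {
                // Because the second matrix is transposed, both operands are
                // traversed row-wise: contiguous reads that the kernel turns
                // into 16-wide vload_half16 loads and a vector fma.
                sum += left[r][k] * right[c][k];
            }
            tgt[r][c] = sum;
        }
    }
    tgt
}
```

The zero padding of each row to a multiple of 16 halves is what lets the
kernel's final vector load run past shared_sz without polluting the sum.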