/*
 * OpenCL support for running (some of) the tensor operations.
 */
use ocl::{
    enums::DeviceInfo, enums::DeviceInfoResult, Buffer, Context, Device, DeviceType, Event,
    Kernel, Platform, Program, Queue,
};
use std::alloc::Layout;
use std::sync::{Arc, RwLock};
use thiserror::Error;

#[derive(Debug)]
#[allow(dead_code)]
struct Programs {
    matrix_mul_transposed_f16_program: Program,
    matrix_mul_transposed_f16: Kernel,
    matrix_mul_transposed_f16_cpu_optimized_program: Program,
    matrix_mul_transposed_f16_cpu_optimized: Kernel,
    silu_f16_program: Program,
    silu_f16: Kernel,
    hadamard_product_f16_program: Program,
    hadamard_product_f16: Kernel,
    transpose_f16_program: Program,
    transpose_f16: Kernel,
}

#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct OpenCL {
    ctx: Context,
    queue: Queue,
    programs: Arc<RwLock<Programs>>,
    is_cpu_device: bool,
}

#[derive(Debug)]
pub struct OpenCLTensor {
    buf: Buffer<u16>, // really is f16
    initial_write_event: Option<Event>,
    last_event: Option<Event>,
    data: *const u16,
    data_layout: Layout,
    nitems: usize,
    rows: i64,
    cols: i64,
    cols_capacity: i64,
    queue: Queue,
    cl: OpenCL,
}

#[derive(Debug)]
pub struct OpenCLEvent {
    event: ocl::Event,
}

impl Drop for OpenCLTensor {
    fn drop(&mut self) {
        // Make sure any in-flight host-to-device write has finished before the
        // host allocation backing it is freed.
        if let Some(event) = self.initial_write_event.take() {
            event.wait_for().unwrap();
        }
        if !self.data.is_null() {
            unsafe {
                std::alloc::dealloc(self.data as *mut u8, self.data_layout);
            }
        }
    }
}

#[derive(Error, Debug)]
pub enum OpenCLError {
    #[error("OpenCL error: {0}")]
    OpenCL(#[from] ocl::Error),
    #[error("Cannot select device")]
    OpenCLDeviceSelection,
}

impl OpenCL {
    pub fn new(verbose: bool, nth_device: usize) -> Result<OpenCL, OpenCLError> {
        let platforms = Platform::list();
        let mut devices: Vec<(Platform, Device)> = Vec::new();
        for platform in platforms {
            for device in Device::list_all(platform)? {
                devices.push((platform, device));
            }
        }
        if verbose {
            println!("Enumerating OpenCL devices:");
        }
        for (idx, (_plat, device)) in devices.iter().enumerate() {
            if verbose {
                println!("OpenCL {} device: {}", idx, device.name()?);
            }
        }
        if nth_device >= devices.len() {
            return Err(OpenCLError::OpenCLDeviceSelection);
        }
        if verbose {
            println!("---");
            println!("Selected OpenCL device: {}", devices[nth_device].1.name()?);
        }
        let ctx = Context::builder()
            .platform(devices[nth_device].0)
            .devices(devices[nth_device].1)
            .build()?;
        let is_cpu_device = matches!(
            devices[nth_device].1.info(DeviceInfo::Type)?,
            DeviceInfoResult::Type(DeviceType::CPU)
        );
        let queue = Queue::new(&ctx, devices[nth_device].1, None)?;
        let programs = make_programs(&ctx, &queue)?;
        Ok(OpenCL {
            ctx,
            queue,
            programs: Arc::new(RwLock::new(programs)),
            is_cpu_device,
        })
    }

    pub fn flush(&self) {
        let _ = self.queue.flush();
    }

    pub fn data_u16_to_gpu(
        &self,
        data: *const u16,
        data_layout: Layout,
        nitems: usize,
        rows: i64,
        cols: i64,
        cols_capacity: i64,
    ) -> Result<OpenCLTensor, OpenCLError> {
        unsafe {
            let buf = Buffer::builder()
                .queue(self.queue.clone())
                .len(nitems)
                .build()?;
            let mut event = Event::empty();
            let data_slice: &[u16] = std::slice::from_raw_parts(data, nitems);
            buf.cmd()
                .write(data_slice)
                .block(false)
                .enew(&mut event)
                .enq()?;
            Ok(OpenCLTensor {
                buf,
                initial_write_event: Some(event),
                last_event: None,
                data,
                data_layout,
                nitems,
                rows,
                cols,
                cols_capacity,
                queue: self.queue.clone(),
                cl: self.clone(),
            })
        }
    }
}
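// A minimal usage sketch (not part of the original API surface; the function
// name and the 32-byte alignment choice are illustrative assumptions). It
// shows the ownership contract of `data_u16_to_gpu`: the tensor takes the raw
// pointer plus its `Layout` and frees the host allocation itself once the
// asynchronous write has completed.
#[allow(dead_code)]
fn example_upload(cl: &OpenCL) -> Result<OpenCLTensor, OpenCLError> {
    let (rows, cols, cols_capacity) = (2i64, 2i64, 2i64);
    let nitems = (rows * cols_capacity) as usize;
    // Two bytes per f16 value.
    let layout = Layout::from_size_align(nitems * 2, 32).unwrap();
    let data = unsafe { std::alloc::alloc_zeroed(layout) } as *mut u16;
    let mut tensor = cl.data_u16_to_gpu(data, layout, nitems, rows, cols, cols_capacity)?;
    // Blocks until the write has landed; also frees the host staging buffer.
    tensor.wait_until_ready();
    Ok(tensor)
}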
impl OpenCLTensor {
    pub fn cl(&self) -> OpenCL {
        self.cl.clone()
    }

    pub fn wait_until_ready(&mut self) {
        if let Some(event) = self.last_event.take() {
            event.wait_for().unwrap();
        }
        if let Some(event) = self.initial_write_event.take() {
            event.wait_for().unwrap();
        }
        // Once the initial write has completed the host-side staging
        // allocation is no longer needed.
        if !self.data.is_null() {
            unsafe {
                std::alloc::dealloc(self.data as *mut u8, self.data_layout);
            }
            self.data = std::ptr::null();
        }
    }

    pub fn data_u16_from_gpu(&mut self, data: *mut u16) -> Result<OpenCLEvent, OpenCLError> {
        unsafe {
            let mut event = Event::empty();
            let data_slice: &mut [u16] = std::slice::from_raw_parts_mut(data, self.nitems);
            self.buf
                .cmd()
                .read(data_slice)
                .block(false)
                .enew(&mut event)
                .enq()?;
            self.last_event = Some(event.clone());
            Ok(OpenCLEvent { event })
        }
    }

    /// Copies all values from another tensor.
    pub fn copy_inplace(&mut self, other: &OpenCLTensor) -> Result<OpenCLEvent, OpenCLError> {
        if other.rows != self.rows || other.cols != self.cols {
            panic!(
                "Cannot in-place copy tensors of different sizes: {}x{} <-- {}x{}",
                self.rows, self.cols, other.rows, other.cols
            );
        }
        let mut event = Event::empty();
        other
            .buf
            .cmd()
            .queue(&other.queue)
            .copy(&self.buf, None, None)
            .enew(&mut event)
            .enq()?;
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }

    pub fn transpose_from(&mut self, other: &OpenCLTensor) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.transpose_f16.set_arg(0, self.buf.clone())?;
        prg.transpose_f16.set_arg(1, other.buf.clone())?;
        prg.transpose_f16.set_arg(2, self.cols_capacity as i32)?;
        prg.transpose_f16.set_arg(3, other.cols_capacity as i32)?;
        let mut event = Event::empty();
        unsafe {
            prg.transpose_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event)
                .enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }

    pub fn hadamard_product_inplace(
        &mut self,
        other: &OpenCLTensor,
    ) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.hadamard_product_f16.set_arg(0, self.buf.clone())?;
        prg.hadamard_product_f16.set_arg(1, other.buf.clone())?;
        prg.hadamard_product_f16
            .set_arg(2, self.cols_capacity as i32)?;
        prg.hadamard_product_f16
            .set_arg(3, other.cols_capacity as i32)?;
        let mut event = Event::empty();
        unsafe {
            prg.hadamard_product_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event)
                .enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }

    pub fn silu_inplace(&mut self) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.silu_f16.set_arg(0, self.buf.clone())?;
        prg.silu_f16.set_arg(1, self.cols_capacity as i32)?;
        let mut event = Event::empty();
        unsafe {
            prg.silu_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event)
                .enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }

    pub fn matrix_mul_inplace_transposed(
        &mut self,
        src: &OpenCLTensor,
        other: &OpenCLTensor,
    ) -> Result<OpenCLEvent, OpenCLError> {
        if src.cols != other.cols {
            panic!(
                "OpenCL matrix_mul_inplace_transposed: src.cols must equal other.cols: {}x{} vs {}x{}",
                src.rows, src.cols, other.rows, other.cols
            );
        }
        if self.rows != src.rows || self.cols != other.rows {
            panic!(
                "OpenCL matrix_mul_inplace_transposed: self.rows must equal src.rows and self.cols must equal other.rows: {}x{} vs {}x{} vs {}x{}",
                self.rows, self.cols, src.rows, src.cols, other.rows, other.cols
            );
        }
        // Clear out the target memory.
        unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
        let prg = self.cl.programs.write().unwrap();
        // 0 = CPU optimized
        // 1 = GPU optimized
        // (A third strategy, a GPU-optimized vector multiply for
        // other.rows == 1, is anticipated here but not implemented.)
        const CPU: u8 = 0;
        const GPU: u8 = 1;
        let strategy: u8 = if self.cl.is_cpu_device { CPU } else { GPU };
        let prg = if strategy == CPU {
            &prg.matrix_mul_transposed_f16_cpu_optimized
        } else {
            &prg.matrix_mul_transposed_f16
        };
        prg.set_arg(0, self.buf.clone())?;
        prg.set_arg(1, src.buf.clone())?;
        prg.set_arg(2, other.buf.clone())?;
        prg.set_arg(3, src.cols_capacity as i32)?;
        prg.set_arg(4, other.cols_capacity as i32)?;
        prg.set_arg(5, self.cols_capacity as i32)?;
        prg.set_arg(6, self.rows as i32)?;
        prg.set_arg(7, self.cols as i32)?;
        prg.set_arg(8, src.cols as i32)?;
        let mut event = Event::empty();
        // The GPU kernel works in 16x16 tiles; round the work size up to the
        // nearest multiple of 16.
        let rows16 = if self.rows % 16 == 0 {
            self.rows
        } else {
            self.rows + 16 - (self.rows % 16)
        };
        let cols16 = if self.cols % 16 == 0 {
            self.cols
        } else {
            self.cols + 16 - (self.cols % 16)
        };
        unsafe {
            if strategy == CPU {
                prg.cmd()
                    .queue(&self.queue)
                    .global_work_size([self.cols as usize, self.rows as usize])
                    .enew(&mut event)
                    .enq()?;
            } else {
                prg.cmd()
                    .queue(&self.queue)
                    .global_work_size([cols16 as usize, rows16 as usize])
                    .local_work_size([16, 16])
                    .enew(&mut event)
                    .enq()?;
            }
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
}
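// A sketch of how the in-place operations above compose (the function and its
// arguments are hypothetical). Every operation records `last_event` on the
// target tensor and returns an `OpenCLEvent`; with a single in-order queue it
// is enough to wait on the final event of the chain.
#[allow(dead_code)]
fn example_matmul_silu(
    out: &mut OpenCLTensor,
    src: &OpenCLTensor,
    weights_transposed: &OpenCLTensor,
) -> Result<(), OpenCLError> {
    out.matrix_mul_inplace_transposed(src, weights_transposed)?;
    let done = out.silu_inplace()?;
    done.wait();
    Ok(())
}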
impl OpenCLEvent {
    #[inline]
    pub fn wait(&self) {
        self.event.wait_for().unwrap();
    }
}

fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError> {
    fn make_program_with_src(ctx: &Context, src: &str) -> Result<Program, OpenCLError> {
        let program = Program::builder().src(src).build(ctx)?;
        Ok(program)
    }
    let matrix_mul_transposed_f16_program =
        make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_SRC)?;
    // Kernel arguments are placeholders here; the real buffers and sizes are
    // bound with set_arg() right before each enqueue.
    let matrix_mul_transposed_f16 = Kernel::builder()
        .program(&matrix_mul_transposed_f16_program)
        .name("matrix_mul_transposed_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let matrix_mul_transposed_f16_cpu_optimized_program =
        make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC)?;
    let matrix_mul_transposed_f16_cpu_optimized = Kernel::builder()
        .program(&matrix_mul_transposed_f16_cpu_optimized_program)
        .name("matrix_mul_transposed_f16_cpu_optimized")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let silu_f16_program = make_program_with_src(ctx, SILU_F16_SRC)?;
    let silu_f16 = Kernel::builder()
        .program(&silu_f16_program)
        .name("silu_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let hadamard_product_f16_program = make_program_with_src(ctx, HADAMARD_PRODUCT_F16_SRC)?;
    let hadamard_product_f16 = Kernel::builder()
        .program(&hadamard_product_f16_program)
        .name("hadamard_product_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let transpose_f16_program = make_program_with_src(ctx, TRANSPOSE_F16_SRC)?;
    let transpose_f16 = Kernel::builder()
        .program(&transpose_f16_program)
        .name("transpose_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    Ok(Programs {
        matrix_mul_transposed_f16_program,
        matrix_mul_transposed_f16,
        matrix_mul_transposed_f16_cpu_optimized_program,
        matrix_mul_transposed_f16_cpu_optimized,
        silu_f16_program,
        silu_f16,
        hadamard_product_f16_program,
        hadamard_product_f16,
        transpose_f16_program,
        transpose_f16,
    })
}
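// The pattern in make_programs() generalizes to new kernels: compile a
// Program from source, then build a Kernel with placeholder arguments that
// set_arg() overwrites before every enqueue. A hypothetical sketch (the
// SCALE_F16_SRC constant and "scale_f16" kernel do not exist in this file):
//
//     let scale_f16_program = make_program_with_src(ctx, SCALE_F16_SRC)?;
//     let scale_f16 = Kernel::builder()
//         .program(&scale_f16_program)
//         .name("scale_f16")
//         .arg(None::<&Buffer<u16>>) // buffer, bound later via set_arg(0, ..)
//         .arg(&0)                   // ncols_capacity, bound via set_arg(1, ..)
//         .queue(queue.clone())
//         .build()?;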
const MATRIX_MUL_TRANSPOSED_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

/* Tiled f16 matrix multiply: tgt = left * right^T, staging 16x16 tiles
 * through local memory. The dispatch rounds the global work size up to a
 * multiple of 16; out-of-range work items load zero padding and participate
 * in the barriers but never store a result. */
__kernel void matrix_mul_transposed_f16(
    __global half *tgt,
    __global const half *left,
    __global const half *right,
    const int left_cols_capacity,
    const int right_cols_capacity,
    const int ncols_capacity,
    const int nrows,
    const int ncols, // size of target
    const int shared_sz
) {
    __local float lefttile[16][16];
    __local float righttile[16][16];

    int global_x = get_global_id(0);
    int global_y = get_global_id(1);
    int local_x = get_local_id(0);
    int local_y = get_local_id(1);
    int num_tiles = (shared_sz + 15) / 16;

    float sum = 0.0f;
    for (int t = 0; t < num_tiles; ++t) {
        if (global_y < nrows) {
            lefttile[local_y][local_x] = vload_half(global_y * left_cols_capacity + t * 16 + local_x, left);
        } else {
            lefttile[local_y][local_x] = 0.0f;
        }
        if (global_x < ncols) {
            righttile[local_y][local_x] = vload_half(global_x * right_cols_capacity + t * 16 + local_y, right);
        } else {
            righttile[local_y][local_x] = 0.0f;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < 16; ++k) {
            sum += lefttile[local_y][k] * righttile[k][local_x];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (global_x < ncols && global_y < nrows) {
        vstore_half(sum, global_y * ncols_capacity + global_x, (__global half*) tgt);
    }
}
"#;
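// The tiled kernel above requires the global work size to be a multiple of
// its 16x16 tile; matrix_mul_inplace_transposed() rounds the dimensions up
// and the in-kernel bounds checks mask off the excess. A hypothetical helper
// expressing that rounding (the dispatch code inlines it instead):
#[allow(dead_code)]
fn round_up_to_tile(x: i64) -> i64 {
    if x % 16 == 0 {
        x
    } else {
        x + 16 - (x % 16)
    }
}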
const MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

/* One work item per target element, reading 16 values at a time. Assumes each
 * row stride (cols_capacity) is a multiple of 16. */
__kernel void matrix_mul_transposed_f16_cpu_optimized(
    __global half *tgt,
    __global const half *left,
    __global const half *right,
    const int left_cols_capacity,
    const int right_cols_capacity,
    const int ncols_capacity,
    const int nrows,
    const int ncols, // size of target
    const int shared_sz
) {
    const int tgt_col = get_global_id(0);
    const int tgt_row = get_global_id(1);

    int col_iterations = shared_sz / 16;
    if (shared_sz % 16 != 0) {
        col_iterations = col_iterations + 1;
    }
    float16 sum = 0;
    for (int col16 = 0; col16 < col_iterations; col16++) {
        const float16 left16 = vload_half16((tgt_row * left_cols_capacity)/16 + col16, (__global const half*) left);
        const float16 right16 = vload_half16((tgt_col * right_cols_capacity)/16 + col16, (__global const half*) right);
        // Hadamard product, FMA'd into the running sum.
        sum = fma(left16, right16, sum);
    }
    // Reduce as accurately as possible: a balanced pairwise tree.
    float sum1 = sum.s0 + sum.s1;
    float sum2 = sum.s2 + sum.s3;
    float sum3 = sum.s4 + sum.s5;
    float sum4 = sum.s6 + sum.s7;
    float sum5 = sum.s8 + sum.s9;
    float sum6 = sum.sa + sum.sb;
    float sum7 = sum.sc + sum.sd;
    float sum8 = sum.se + sum.sf;
    float sum11 = sum1 + sum2;
    float sum12 = sum3 + sum4;
    float sum13 = sum5 + sum6;
    float sum14 = sum7 + sum8;
    float sum21 = sum11 + sum12;
    float sum22 = sum13 + sum14;
    float total = sum21 + sum22;
    vstore_half(total, 0, (__global half*) &tgt[tgt_row * ncols_capacity + tgt_col]);
}
"#;

/// Computes SiLU for every f16 value in the tensor.
const SILU_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void silu_f16(__global half *tgt, const int ncols_capacity) {
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const float val = vload_half(tgt_row * ncols_capacity + tgt_col, (__global const half*) tgt);
    // silu(x) = x * sigmoid(x)
    const float result = val * (1.0f / (1.0f + exp(-val)));
    vstore_half(result, tgt_row * ncols_capacity + tgt_col, (__global half*) tgt);
}
"#;

/// Computes the Hadamard product of two identically sized tensors.
const HADAMARD_PRODUCT_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void hadamard_product_f16(__global half *tgt,
                                   __global const half *left,
                                   const int ncols_capacity,
                                   const int left_cols_capacity) {
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const float tgt_value = vload_half(tgt_row * ncols_capacity + tgt_col, (__global const half*) tgt);
    const float left_value = vload_half(tgt_row * left_cols_capacity + tgt_col, (__global const half*) left);
    const float result = tgt_value * left_value;
    vstore_half(result, tgt_row * ncols_capacity + tgt_col, (__global half*) tgt);
}
"#;

/// Computes the transpose of a matrix.
const TRANSPOSE_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void transpose_f16(__global half *tgt,
                            __global const half *left,
                            const int ncols_capacity,
                            const int left_cols_capacity) {
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const int src_row = tgt_col;
    const int src_col = tgt_row;
    const float val = vload_half(src_row * left_cols_capacity + src_col, (__global const half*) left);
    vstore_half(val, tgt_row * ncols_capacity + tgt_col, (__global half*) tgt);
}
"#;
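// The CPU-optimized kernel reduces its float16 accumulator with a balanced
// pairwise tree rather than a left-to-right fold, which keeps the worst-case
// rounding error growth logarithmic in the number of terms instead of linear.
// A scalar Rust illustration of the same reduction shape (hypothetical; not
// used by the kernels):
#[allow(dead_code)]
fn pairwise_sum(values: &[f32]) -> f32 {
    match values.len() {
        0 => 0.0,
        1 => values[0],
        n => {
            let (lo, hi) = values.split_at(n / 2);
            pairwise_sum(lo) + pairwise_sum(hi)
        }
    }
}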