From 1c5ec042170328c2545a658d6d151144b939d9a3 Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Wed, 15 Mar 2023 01:50:00 -0700 Subject: [PATCH] Add a different kernel to be used when OpenCL device is a CPU. This is almost the same code I had before. It runs better on CPUs rather than GPUs. --- src/tensor_opencl_support.rs | 137 ++++++++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 26 deletions(-) diff --git a/src/tensor_opencl_support.rs b/src/tensor_opencl_support.rs index 220c6f2..aa81f05 100644 --- a/src/tensor_opencl_support.rs +++ b/src/tensor_opencl_support.rs @@ -2,7 +2,10 @@ * OpenCL stuff to run (some) of the tensor operations. */ -use ocl::{Buffer, Context, Device, Event, Kernel, Platform, Program, Queue}; +use ocl::{ + enums::DeviceInfo, enums::DeviceInfoResult, Buffer, Context, Device, DeviceType, Event, Kernel, + Platform, Program, Queue, +}; use std::alloc::Layout; use std::sync::{Arc, RwLock}; use thiserror::Error; @@ -12,6 +15,8 @@ use thiserror::Error; struct Programs { matrix_mul_transposed_f16_program: Program, matrix_mul_transposed_f16: Kernel, + matrix_mul_transposed_f16_cpu_optimized_program: Program, + matrix_mul_transposed_f16_cpu_optimized: Kernel, silu_f16_program: Program, silu_f16: Kernel, hadamard_product_f16_program: Program, @@ -26,6 +31,7 @@ pub struct OpenCL { ctx: Context, queue: Queue, programs: Arc>, + is_cpu_device: bool, } #[derive(Debug)] @@ -104,12 +110,18 @@ impl OpenCL { .devices(devices[nth_device].1) .build()?; + let is_cpu_device = match devices[nth_device].1.info(DeviceInfo::Type)? { + DeviceInfoResult::Type(DeviceType::CPU) => true, + _ => false, + }; + let queue = Queue::new(&ctx, devices[nth_device].1, None)?; let programs = make_programs(&ctx, &queue)?; Ok(OpenCL { ctx: ctx, queue: queue, programs: Arc::new(RwLock::new(programs)), + is_cpu_device, }) } @@ -306,19 +318,21 @@ impl OpenCLTensor { unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? }; let prg = self.cl.programs.write().unwrap(); - prg.matrix_mul_transposed_f16.set_arg(0, self.buf.clone())?; - prg.matrix_mul_transposed_f16.set_arg(1, src.buf.clone())?; - prg.matrix_mul_transposed_f16 - .set_arg(2, other.buf.clone())?; - prg.matrix_mul_transposed_f16 - .set_arg(3, src.cols_capacity as i32)?; - prg.matrix_mul_transposed_f16 - .set_arg(4, other.cols_capacity as i32)?; - prg.matrix_mul_transposed_f16 - .set_arg(5, self.cols_capacity as i32)?; - prg.matrix_mul_transposed_f16.set_arg(6, self.rows as i32)?; - prg.matrix_mul_transposed_f16.set_arg(7, self.cols as i32)?; - prg.matrix_mul_transposed_f16.set_arg(8, src.cols as i32)?; + + let prg = if self.cl.is_cpu_device { + &prg.matrix_mul_transposed_f16_cpu_optimized + } else { + &prg.matrix_mul_transposed_f16 + }; + prg.set_arg(0, self.buf.clone())?; + prg.set_arg(1, src.buf.clone())?; + prg.set_arg(2, other.buf.clone())?; + prg.set_arg(3, src.cols_capacity as i32)?; + prg.set_arg(4, other.cols_capacity as i32)?; + prg.set_arg(5, self.cols_capacity as i32)?; + prg.set_arg(6, self.rows as i32)?; + prg.set_arg(7, self.cols as i32)?; + prg.set_arg(8, src.cols as i32)?; let mut event = Event::empty(); let rows16 = if self.rows % 16 == 0 { @@ -333,14 +347,22 @@ impl OpenCLTensor { }; unsafe { - let b = prg - .matrix_mul_transposed_f16 - .cmd() - .queue(&self.queue) - .global_work_size([cols16 as usize, rows16 as usize]) - .local_work_size([16, 16]) - .enew(&mut event); - b.enq()?; + if self.cl.is_cpu_device { + let b = prg + .cmd() + .queue(&self.queue) + .global_work_size([self.cols as usize, self.rows as usize]) + .enew(&mut event); + b.enq()?; + } else { + let b = prg + .cmd() + .queue(&self.queue) + .global_work_size([cols16 as usize, rows16 as usize]) + .local_work_size([16, 16]) + .enew(&mut event); + b.enq()?; + } } self.last_event = Some(event.clone()); Ok(OpenCLEvent { event }) @@ -376,6 +398,22 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result .arg(&0) .queue(queue.clone()) .build()?; + let matrix_mul_transposed_f16_cpu_optimized_program = + make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC)?; + let matrix_mul_transposed_f16_cpu_optimized = Kernel::builder() + .program(&matrix_mul_transposed_f16_cpu_optimized_program) + .name("matrix_mul_transposed_f16_cpu_optimized") + .arg(None::<&Buffer>) + .arg(None::<&Buffer>) + .arg(None::<&Buffer>) + .arg(&0) + .arg(&0) + .arg(&0) + .arg(&0) + .arg(&0) + .arg(&0) + .queue(queue.clone()) + .build()?; let silu_f16_program = make_program_with_src(ctx, SILU_F16_SRC)?; let silu_f16 = Kernel::builder() .program(&silu_f16_program) @@ -407,6 +445,8 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result Ok(Programs { matrix_mul_transposed_f16_program, matrix_mul_transposed_f16, + matrix_mul_transposed_f16_cpu_optimized_program, + matrix_mul_transposed_f16_cpu_optimized, silu_f16_program, silu_f16, hadamard_product_f16_program, @@ -419,10 +459,6 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result const MATRIX_MUL_TRANSPOSED_F16_SRC: &str = r#" #pragma OPENCL EXTENSION cl_khr_fp16 : enable -/* - * TODO: need to read resources like https://cnugteren.github.io/tutorial/pages/page1.html to - * figure out how matrix multiply faster. - */ __kernel void matrix_mul_transposed_f16( __global half *tgt, __global const half *left, @@ -467,6 +503,55 @@ __kernel void matrix_mul_transposed_f16( } "#; +const MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC: &str = r#" +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void matrix_mul_transposed_f16_cpu_optimized( + __global half *tgt, + __global const half *left, + __global const half *right, + const int left_cols_capacity, + const int right_cols_capacity, + const int ncols_capacity, + const int nrows, + const int ncols, // size of target + const int shared_sz +) { + const int tgt_col = get_global_id(0); + const int tgt_row = get_global_id(1); + int col_iterations = shared_sz / 16; + if (shared_sz % 16 != 0) { + col_iterations = col_iterations + 1; + } + float16 sum = 0; + for (int col16 = 0; col16 < col_iterations; col16++) { + const float16 left8 = vload_half16((tgt_row * left_cols_capacity)/16 + col16, (__global const half*) left); + const float16 right8 = vload_half16((tgt_col * right_cols_capacity)/16 + col16, (__global const half*) right); + // hadamard product FMA add it to sum + // const float16 result8 = left8 * right8; + // sum += result8; + sum = fma(left8, right8, sum); + } + // Reduce as accurately as possible + float sum1 = sum.s0 + sum.s1; + float sum2 = sum.s2 + sum.s3; + float sum3 = sum.s4 + sum.s5; + float sum4 = sum.s6 + sum.s7; + float sum5 = sum.s8 + sum.s9; + float sum6 = sum.sa + sum.sb; + float sum7 = sum.sc + sum.sd; + float sum8 = sum.se + sum.sf; + float sum11 = sum1 + sum2; + float sum12 = sum3 + sum4; + float sum13 = sum5 + sum6; + float sum14 = sum7 + sum8; + float sum21 = sum11 + sum12; + float sum22 = sum13 + sum14; + float total = sum21 + sum22; + vstore_half(total, 0, (__global half*) &tgt[tgt_row * ncols_capacity + tgt_col]); +} +"#; + /// Computes SILU for every f16 value in the tensor const SILU_F16_SRC: &str = r#" #pragma OPENCL EXTENSION cl_khr_fp16 : enable