Add a different kernel to be used when the OpenCL device is a CPU.

This is almost the same code I had before. It runs better on CPUs than
it does on GPUs.
Branch: master
Mikko Juola, 3 years ago
parent 8c64313fec
commit 1c5ec04217

@@ -2,7 +2,10 @@
  * OpenCL stuff to run (some) of the tensor operations.
  */
 
-use ocl::{Buffer, Context, Device, Event, Kernel, Platform, Program, Queue};
+use ocl::{
+    enums::DeviceInfo, enums::DeviceInfoResult, Buffer, Context, Device, DeviceType, Event, Kernel,
+    Platform, Program, Queue,
+};
 use std::alloc::Layout;
 use std::sync::{Arc, RwLock};
 use thiserror::Error;
@@ -12,6 +15,8 @@ use thiserror::Error;
 struct Programs {
     matrix_mul_transposed_f16_program: Program,
     matrix_mul_transposed_f16: Kernel,
+    matrix_mul_transposed_f16_cpu_optimized_program: Program,
+    matrix_mul_transposed_f16_cpu_optimized: Kernel,
     silu_f16_program: Program,
     silu_f16: Kernel,
     hadamard_product_f16_program: Program,
@@ -26,6 +31,7 @@ pub struct OpenCL {
     ctx: Context,
     queue: Queue,
     programs: Arc<RwLock<Programs>>,
+    is_cpu_device: bool,
 }
 
 #[derive(Debug)]
@@ -104,12 +110,18 @@ impl OpenCL {
             .devices(devices[nth_device].1)
             .build()?;
 
+        let is_cpu_device = match devices[nth_device].1.info(DeviceInfo::Type)? {
+            DeviceInfoResult::Type(DeviceType::CPU) => true,
+            _ => false,
+        };
+
         let queue = Queue::new(&ctx, devices[nth_device].1, None)?;
         let programs = make_programs(&ctx, &queue)?;
 
         Ok(OpenCL {
            ctx: ctx,
            queue: queue,
            programs: Arc::new(RwLock::new(programs)),
+           is_cpu_device,
        })
    }
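A small aside: the device-type check above could equivalently be written with the matches! macro; a minimal sketch, assuming the same ocl enums imported earlier in this diff:

    let is_cpu_device = matches!(
        devices[nth_device].1.info(DeviceInfo::Type)?,
        DeviceInfoResult::Type(DeviceType::CPU)
    );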
@@ -306,19 +318,21 @@ impl OpenCLTensor {
         unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
 
         let prg = self.cl.programs.write().unwrap();
-        prg.matrix_mul_transposed_f16.set_arg(0, self.buf.clone())?;
-        prg.matrix_mul_transposed_f16.set_arg(1, src.buf.clone())?;
-        prg.matrix_mul_transposed_f16
-            .set_arg(2, other.buf.clone())?;
-        prg.matrix_mul_transposed_f16
-            .set_arg(3, src.cols_capacity as i32)?;
-        prg.matrix_mul_transposed_f16
-            .set_arg(4, other.cols_capacity as i32)?;
-        prg.matrix_mul_transposed_f16
-            .set_arg(5, self.cols_capacity as i32)?;
-        prg.matrix_mul_transposed_f16.set_arg(6, self.rows as i32)?;
-        prg.matrix_mul_transposed_f16.set_arg(7, self.cols as i32)?;
-        prg.matrix_mul_transposed_f16.set_arg(8, src.cols as i32)?;
+        let prg = if self.cl.is_cpu_device {
+            &prg.matrix_mul_transposed_f16_cpu_optimized
+        } else {
+            &prg.matrix_mul_transposed_f16
+        };
+
+        prg.set_arg(0, self.buf.clone())?;
+        prg.set_arg(1, src.buf.clone())?;
+        prg.set_arg(2, other.buf.clone())?;
+        prg.set_arg(3, src.cols_capacity as i32)?;
+        prg.set_arg(4, other.cols_capacity as i32)?;
+        prg.set_arg(5, self.cols_capacity as i32)?;
+        prg.set_arg(6, self.rows as i32)?;
+        prg.set_arg(7, self.cols as i32)?;
+        prg.set_arg(8, src.cols as i32)?;
 
         let mut event = Event::empty();
         let rows16 = if self.rows % 16 == 0 {
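For readability: the indices passed to set_arg correspond positionally to the kernels' parameter lists, which are identical for the GPU and the CPU-optimized variants. Mapped out (mirroring the kernel signature shown later in this diff):

    // arg 0: tgt (output buffer)      arg 5: ncols_capacity
    // arg 1: left (src buffer)        arg 6: nrows
    // arg 2: right (other buffer)     arg 7: ncols
    // arg 3: left_cols_capacity       arg 8: shared_sz (src.cols)
    // arg 4: right_cols_capacity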
@@ -333,8 +347,15 @@ impl OpenCLTensor {
         };
 
         unsafe {
+            if self.cl.is_cpu_device {
+                let b = prg
+                    .cmd()
+                    .queue(&self.queue)
+                    .global_work_size([self.cols as usize, self.rows as usize])
+                    .enew(&mut event);
+                b.enq()?;
+            } else {
             let b = prg
-                .matrix_mul_transposed_f16
                 .cmd()
                 .queue(&self.queue)
                 .global_work_size([cols16 as usize, rows16 as usize])
@@ -342,6 +363,7 @@ impl OpenCLTensor {
                 .enew(&mut event);
             b.enq()?;
             }
+        }
         self.last_event = Some(event.clone());
         Ok(OpenCLEvent { event })
     }
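A side note on the two enqueue paths: they differ only in the global work size. The CPU path launches exactly one work item per output element (cols × rows), leaving scheduling granularity to the CPU OpenCL runtime, while the GPU path keeps the 16-aligned cols16 × rows16 grid. The definitions of rows16 and cols16 are truncated out of the hunk above; judging from the `self.rows % 16 == 0` check, they presumably round each dimension up to the next multiple of 16, along the lines of this sketch (an assumption, not code from the commit):

    // Assumed shape of the rows16/cols16 rounding: round n up to a multiple of 16.
    fn round_up_to_16(n: usize) -> usize {
        if n % 16 == 0 { n } else { (n / 16 + 1) * 16 }
    }
    // e.g. round_up_to_16(100) == 112, round_up_to_16(96) == 96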
@@ -376,6 +398,22 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
         .arg(&0)
         .queue(queue.clone())
         .build()?;
+    let matrix_mul_transposed_f16_cpu_optimized_program =
+        make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC)?;
+    let matrix_mul_transposed_f16_cpu_optimized = Kernel::builder()
+        .program(&matrix_mul_transposed_f16_cpu_optimized_program)
+        .name("matrix_mul_transposed_f16_cpu_optimized")
+        .arg(None::<&Buffer<u16>>)
+        .arg(None::<&Buffer<u16>>)
+        .arg(None::<&Buffer<u16>>)
+        .arg(&0)
+        .arg(&0)
+        .arg(&0)
+        .arg(&0)
+        .arg(&0)
+        .arg(&0)
+        .queue(queue.clone())
+        .build()?;
     let silu_f16_program = make_program_with_src(ctx, SILU_F16_SRC)?;
     let silu_f16 = Kernel::builder()
         .program(&silu_f16_program)
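A note on the placeholder arguments: with the ocl crate, a kernel's argument types must be declared when the kernel is built; `.arg(None::<&Buffer<u16>>)` declares a buffer argument without binding a value, and `.arg(&0)` declares an i32, so the real values can be bound later with set_arg, as the matmul path above does. A minimal sketch of the pattern (kernel name and argument list hypothetical):

    use ocl::{Buffer, Kernel, Program, Queue};

    // Hypothetical helper: declare argument types at build time, bind values later.
    fn build_deferred_kernel(program: &Program, queue: &Queue) -> ocl::Result<Kernel> {
        Kernel::builder()
            .program(program)
            .name("some_kernel")        // hypothetical kernel name
            .arg(None::<&Buffer<u16>>)  // arg 0: buffer, bound later via set_arg
            .arg(&0i32)                 // arg 1: scalar placeholder, bound later
            .queue(queue.clone())
            .build()
    }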
@@ -407,6 +445,8 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
     Ok(Programs {
         matrix_mul_transposed_f16_program,
         matrix_mul_transposed_f16,
+        matrix_mul_transposed_f16_cpu_optimized_program,
+        matrix_mul_transposed_f16_cpu_optimized,
         silu_f16_program,
         silu_f16,
         hadamard_product_f16_program,
@@ -419,10 +459,6 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
 const MATRIX_MUL_TRANSPOSED_F16_SRC: &str = r#"
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-/*
- * TODO: need to read resources like https://cnugteren.github.io/tutorial/pages/page1.html to
- * figure out how matrix multiply faster.
- */
 __kernel void matrix_mul_transposed_f16(
     __global half *tgt,
     __global const half *left,
@@ -467,6 +503,55 @@ __kernel void matrix_mul_transposed_f16(
 }
 "#;
 
+const MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC: &str = r#"
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void matrix_mul_transposed_f16_cpu_optimized(
+    __global half *tgt,
+    __global const half *left,
+    __global const half *right,
+    const int left_cols_capacity,
+    const int right_cols_capacity,
+    const int ncols_capacity,
+    const int nrows,
+    const int ncols, // size of target
+    const int shared_sz
+) {
+    const int tgt_col = get_global_id(0);
+    const int tgt_row = get_global_id(1);
+    int col_iterations = shared_sz / 16;
+    if (shared_sz % 16 != 0) {
+        col_iterations = col_iterations + 1;
+    }
+    float16 sum = 0;
+    for (int col16 = 0; col16 < col_iterations; col16++) {
+        const float16 left16 = vload_half16((tgt_row * left_cols_capacity)/16 + col16, (__global const half*) left);
+        const float16 right16 = vload_half16((tgt_col * right_cols_capacity)/16 + col16, (__global const half*) right);
+        // Hadamard product folded into the running sum with one fused
+        // multiply-add per lane (i.e. sum += left16 * right16, but fused).
+        sum = fma(left16, right16, sum);
+    }
+
+    // Reduce the 16 lanes pairwise to keep the summation as accurate as possible.
+    float sum1 = sum.s0 + sum.s1;
+    float sum2 = sum.s2 + sum.s3;
+    float sum3 = sum.s4 + sum.s5;
+    float sum4 = sum.s6 + sum.s7;
+    float sum5 = sum.s8 + sum.s9;
+    float sum6 = sum.sa + sum.sb;
+    float sum7 = sum.sc + sum.sd;
+    float sum8 = sum.se + sum.sf;
+    float sum11 = sum1 + sum2;
+    float sum12 = sum3 + sum4;
+    float sum13 = sum5 + sum6;
+    float sum14 = sum7 + sum8;
+    float sum21 = sum11 + sum12;
+    float sum22 = sum13 + sum14;
+    float total = sum21 + sum22;
+    vstore_half(total, 0, (__global half*) &tgt[tgt_row * ncols_capacity + tgt_col]);
+}
+"#;
+
 /// Computes SILU for every f16 value in the tensor
 const SILU_F16_SRC: &str = r#"
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
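As a reading aid, here is a scalar f32 reference (not part of the commit, ignoring the f16 conversions) of what each work item of the CPU-optimized kernel computes: tgt[row][col] is the dot product of row `row` of `left` with row `col` of `right`, i.e. a matrix multiply with the right operand stored transposed, with row strides given by the `*_capacity` parameters:

    // Hypothetical scalar equivalent of the kernel's math, for clarity only.
    fn matmul_transposed_ref(
        tgt: &mut [f32], left: &[f32], right: &[f32],
        left_cols_capacity: usize, right_cols_capacity: usize, ncols_capacity: usize,
        nrows: usize, ncols: usize, shared_sz: usize,
    ) {
        for row in 0..nrows {
            for col in 0..ncols {
                let mut sum = 0.0f32;
                for k in 0..shared_sz {
                    sum += left[row * left_cols_capacity + k]
                         * right[col * right_cols_capacity + k];
                }
                tgt[row * ncols_capacity + col] = sum;
            }
        }
    }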
