|
|
|
|
@ -2,7 +2,10 @@
|
|
|
|
|
* OpenCL support for running some of the tensor operations.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
use ocl::{Buffer, Context, Device, Event, Kernel, Platform, Program, Queue};
|
|
|
|
|
use ocl::{
|
|
|
|
|
enums::DeviceInfo, enums::DeviceInfoResult, Buffer, Context, Device, DeviceType, Event, Kernel,
|
|
|
|
|
Platform, Program, Queue,
|
|
|
|
|
};
|
|
|
|
|
use std::alloc::Layout;
|
|
|
|
|
use std::sync::{Arc, RwLock};
|
|
|
|
|
use thiserror::Error;
|
|
|
|
|
@ -12,6 +15,8 @@ use thiserror::Error;
|
|
|
|
|
struct Programs {
|
|
|
|
|
matrix_mul_transposed_f16_program: Program,
|
|
|
|
|
matrix_mul_transposed_f16: Kernel,
|
|
|
|
|
matrix_mul_transposed_f16_cpu_optimized_program: Program,
|
|
|
|
|
matrix_mul_transposed_f16_cpu_optimized: Kernel,
|
|
|
|
|
silu_f16_program: Program,
|
|
|
|
|
silu_f16: Kernel,
|
|
|
|
|
hadamard_product_f16_program: Program,
|
|
|
|
|
@ -26,6 +31,7 @@ pub struct OpenCL {
|
|
|
|
|
ctx: Context,
|
|
|
|
|
queue: Queue,
|
|
|
|
|
programs: Arc<RwLock<Programs>>,
|
|
|
|
|
is_cpu_device: bool,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug)]
|
|
|
|
|
@ -104,12 +110,18 @@ impl OpenCL {
|
|
|
|
|
.devices(devices[nth_device].1)
|
|
|
|
|
.build()?;
|
|
|
|
|
|
|
|
|
|
let is_cpu_device = match devices[nth_device].1.info(DeviceInfo::Type)? {
|
|
|
|
|
DeviceInfoResult::Type(DeviceType::CPU) => true,
|
|
|
|
|
_ => false,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let queue = Queue::new(&ctx, devices[nth_device].1, None)?;
|
|
|
|
|
let programs = make_programs(&ctx, &queue)?;
|
|
|
|
|
Ok(OpenCL {
|
|
|
|
|
ctx: ctx,
|
|
|
|
|
queue: queue,
|
|
|
|
|
programs: Arc::new(RwLock::new(programs)),
|
|
|
|
|
is_cpu_device,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -306,19 +318,21 @@ impl OpenCLTensor {
|
|
|
|
|
unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
|
|
|
|
|
|
|
|
|
|
let prg = self.cl.programs.write().unwrap();
|
|
|
|
|
prg.matrix_mul_transposed_f16.set_arg(0, self.buf.clone())?;
|
|
|
|
|
prg.matrix_mul_transposed_f16.set_arg(1, src.buf.clone())?;
|
|
|
|
|
prg.matrix_mul_transposed_f16
|
|
|
|
|
.set_arg(2, other.buf.clone())?;
|
|
|
|
|
prg.matrix_mul_transposed_f16
|
|
|
|
|
.set_arg(3, src.cols_capacity as i32)?;
|
|
|
|
|
prg.matrix_mul_transposed_f16
|
|
|
|
|
.set_arg(4, other.cols_capacity as i32)?;
|
|
|
|
|
prg.matrix_mul_transposed_f16
|
|
|
|
|
.set_arg(5, self.cols_capacity as i32)?;
|
|
|
|
|
prg.matrix_mul_transposed_f16.set_arg(6, self.rows as i32)?;
|
|
|
|
|
prg.matrix_mul_transposed_f16.set_arg(7, self.cols as i32)?;
|
|
|
|
|
prg.matrix_mul_transposed_f16.set_arg(8, src.cols as i32)?;
|
|
|
|
|
|
|
|
|
|
let prg = if self.cl.is_cpu_device {
|
|
|
|
|
&prg.matrix_mul_transposed_f16_cpu_optimized
|
|
|
|
|
} else {
|
|
|
|
|
&prg.matrix_mul_transposed_f16
|
|
|
|
|
};
|
|
|
|
|
prg.set_arg(0, self.buf.clone())?;
|
|
|
|
|
prg.set_arg(1, src.buf.clone())?;
|
|
|
|
|
prg.set_arg(2, other.buf.clone())?;
|
|
|
|
|
prg.set_arg(3, src.cols_capacity as i32)?;
|
|
|
|
|
prg.set_arg(4, other.cols_capacity as i32)?;
|
|
|
|
|
prg.set_arg(5, self.cols_capacity as i32)?;
|
|
|
|
|
prg.set_arg(6, self.rows as i32)?;
|
|
|
|
|
prg.set_arg(7, self.cols as i32)?;
|
|
|
|
|
prg.set_arg(8, src.cols as i32)?;
|
|
|
|
|
let mut event = Event::empty();
|
|
|
|
|
|
|
|
|
|
let rows16 = if self.rows % 16 == 0 {
|
|
|
|
|
@ -333,8 +347,15 @@ impl OpenCLTensor {
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
unsafe {
|
|
|
|
|
if self.cl.is_cpu_device {
|
|
|
|
|
let b = prg
|
|
|
|
|
.cmd()
|
|
|
|
|
.queue(&self.queue)
|
|
|
|
|
.global_work_size([self.cols as usize, self.rows as usize])
|
|
|
|
|
.enew(&mut event);
|
|
|
|
|
b.enq()?;
|
|
|
|
|
} else {
|
|
|
|
|
let b = prg
|
|
|
|
|
.matrix_mul_transposed_f16
|
|
|
|
|
.cmd()
|
|
|
|
|
.queue(&self.queue)
|
|
|
|
|
.global_work_size([cols16 as usize, rows16 as usize])
|
|
|
|
|
@ -342,6 +363,7 @@ impl OpenCLTensor {
|
|
|
|
|
.enew(&mut event);
|
|
|
|
|
b.enq()?;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
self.last_event = Some(event.clone());
|
|
|
|
|
Ok(OpenCLEvent { event })
|
|
|
|
|
}
|
|
|
|
|
@ -376,6 +398,22 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.queue(queue.clone())
|
|
|
|
|
.build()?;
|
|
|
|
|
let matrix_mul_transposed_f16_cpu_optimized_program =
|
|
|
|
|
make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC)?;
|
|
|
|
|
let matrix_mul_transposed_f16_cpu_optimized = Kernel::builder()
|
|
|
|
|
.program(&matrix_mul_transposed_f16_cpu_optimized_program)
|
|
|
|
|
.name("matrix_mul_transposed_f16_cpu_optimized")
|
|
|
|
|
.arg(None::<&Buffer<u16>>)
|
|
|
|
|
.arg(None::<&Buffer<u16>>)
|
|
|
|
|
.arg(None::<&Buffer<u16>>)
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.arg(&0)
|
|
|
|
|
.queue(queue.clone())
|
|
|
|
|
.build()?;
|
|
|
|
|
let silu_f16_program = make_program_with_src(ctx, SILU_F16_SRC)?;
|
|
|
|
|
let silu_f16 = Kernel::builder()
|
|
|
|
|
.program(&silu_f16_program)
|
|
|
|
|
@ -407,6 +445,8 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
|
|
|
|
|
Ok(Programs {
|
|
|
|
|
matrix_mul_transposed_f16_program,
|
|
|
|
|
matrix_mul_transposed_f16,
|
|
|
|
|
matrix_mul_transposed_f16_cpu_optimized_program,
|
|
|
|
|
matrix_mul_transposed_f16_cpu_optimized,
|
|
|
|
|
silu_f16_program,
|
|
|
|
|
silu_f16,
|
|
|
|
|
hadamard_product_f16_program,
|
|
|
|
|
@ -419,10 +459,6 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
|
|
|
|
|
const MATRIX_MUL_TRANSPOSED_F16_SRC: &str = r#"
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* TODO: need to read resources like https://cnugteren.github.io/tutorial/pages/page1.html to
|
|
|
|
|
* figure out how matrix multiply faster.
|
|
|
|
|
*/
|
|
|
|
|
__kernel void matrix_mul_transposed_f16(
|
|
|
|
|
__global half *tgt,
|
|
|
|
|
__global const half *left,
|
|
|
|
|
@ -467,6 +503,55 @@ __kernel void matrix_mul_transposed_f16(
|
|
|
|
|
}
|
|
|
|
|
"#;
|
|
|
|
|
|
|
|
|
|
const MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC: &str = r#"
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
|
|
|
|
|
|
|
|
__kernel void matrix_mul_transposed_f16_cpu_optimized(
|
|
|
|
|
__global half *tgt,
|
|
|
|
|
__global const half *left,
|
|
|
|
|
__global const half *right,
|
|
|
|
|
const int left_cols_capacity,
|
|
|
|
|
const int right_cols_capacity,
|
|
|
|
|
const int ncols_capacity,
|
|
|
|
|
const int nrows,
|
|
|
|
|
const int ncols, // size of target
|
|
|
|
|
const int shared_sz
|
|
|
|
|
) {
|
|
|
|
|
const int tgt_col = get_global_id(0);
|
|
|
|
|
const int tgt_row = get_global_id(1);
|
|
|
|
|
int col_iterations = shared_sz / 16;
|
|
|
|
|
if (shared_sz % 16 != 0) {
|
|
|
|
|
col_iterations = col_iterations + 1;
|
|
|
|
|
}
|
|
|
|
|
float16 sum = 0;
|
|
|
|
|
for (int col16 = 0; col16 < col_iterations; col16++) {
|
|
|
|
|
const float16 left8 = vload_half16((tgt_row * left_cols_capacity)/16 + col16, (__global const half*) left);
|
|
|
|
|
const float16 right8 = vload_half16((tgt_col * right_cols_capacity)/16 + col16, (__global const half*) right);
|
|
|
|
|
// hadamard product FMA add it to sum
|
|
|
|
|
// const float16 result8 = left8 * right8;
|
|
|
|
|
// sum += result8;
|
|
|
|
|
sum = fma(left8, right8, sum);
|
|
|
|
|
}
|
|
|
|
|
// Reduce as accurately as possible
|
|
|
|
|
float sum1 = sum.s0 + sum.s1;
|
|
|
|
|
float sum2 = sum.s2 + sum.s3;
|
|
|
|
|
float sum3 = sum.s4 + sum.s5;
|
|
|
|
|
float sum4 = sum.s6 + sum.s7;
|
|
|
|
|
float sum5 = sum.s8 + sum.s9;
|
|
|
|
|
float sum6 = sum.sa + sum.sb;
|
|
|
|
|
float sum7 = sum.sc + sum.sd;
|
|
|
|
|
float sum8 = sum.se + sum.sf;
|
|
|
|
|
float sum11 = sum1 + sum2;
|
|
|
|
|
float sum12 = sum3 + sum4;
|
|
|
|
|
float sum13 = sum5 + sum6;
|
|
|
|
|
float sum14 = sum7 + sum8;
|
|
|
|
|
float sum21 = sum11 + sum12;
|
|
|
|
|
float sum22 = sum13 + sum14;
|
|
|
|
|
float total = sum21 + sum22;
|
|
|
|
|
vstore_half(total, 0, (__global half*) &tgt[tgt_row * ncols_capacity + tgt_col]);
|
|
|
|
|
}
|
|
|
|
|
"#;
|
|
|
|
|
|
|
|
|
|
/// Computes SILU for every f16 value in the tensor
|
|
|
|
|
const SILU_F16_SRC: &str = r#"
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
|
|
|
|
|