@@ -2,10 +2,7 @@
 * OpenCL stuff to run (some) of the tensor operations.
 */
use ocl::{
    enums::DeviceInfo, enums::DeviceInfoResult, Buffer, Context, Device, DeviceType, Event, Kernel,
    Platform, Program, Queue,
};
use ocl::{Buffer, Context, Device, Event, Kernel, Platform, Program, Queue};
use std::alloc::Layout;
use std::sync::{Arc, RwLock};
use thiserror::Error;
@@ -13,16 +10,28 @@ use thiserror::Error;
#[derive(Debug)]
#[allow(dead_code)]
struct Programs {
    matrix_mul_transposed_f16_program: Program,
    matrix_mul_transposed_f16: Kernel,
    matrix_mul_transposed_f16_cpu_optimized_program: Program,
    matrix_mul_transposed_f16_cpu_optimized: Kernel,
    matrix_mul_transposed_by_row_f16_program: Program,
    matrix_mul_transposed_by_row_f16: Kernel,
    silu_f16_program: Program,
    silu_f16: Kernel,
    hadamard_product_f16_program: Program,
    hadamard_product_f16: Kernel,
    transpose_f16_program: Program,
    transpose_f16: Kernel,
    pow_f16_program: Program,
    pow_f16: Kernel,
    mean_cols_f16_program: Program,
    mean_cols_f16: Kernel,
    add_scalar_f16_program: Program,
    add_scalar_f16: Kernel,
    scalar_multiply_broadcast_f16_program: Program,
    scalar_multiply_broadcast_f16: Kernel,
    hadamard_product_broadcast_f16_program: Program,
    hadamard_product_broadcast_f16: Kernel,
    rsqrt_f16_program: Program,
    rsqrt_f16: Kernel,
    add_f16_program: Program,
    add_f16: Kernel,
}
#[derive(Debug, Clone)]
@@ -31,7 +40,6 @@ pub struct OpenCL {
    ctx: Context,
    queue: Queue,
    programs: Arc<RwLock<Programs>>,
    is_cpu_device: bool,
}
#[derive(Debug)]
@@ -110,18 +118,12 @@ impl OpenCL {
            .devices(devices[nth_device].1)
            .build()?;
        let is_cpu_device = match devices[nth_device].1.info(DeviceInfo::Type)? {
            DeviceInfoResult::Type(DeviceType::CPU) => true,
            _ => false,
        };
        let queue = Queue::new(&ctx, devices[nth_device].1, None)?;
        let programs = make_programs(&ctx, &queue)?;
        Ok(OpenCL {
            ctx: ctx,
            queue: queue,
            programs: Arc::new(RwLock::new(programs)),
            is_cpu_device,
        })
    }
@@ -229,6 +231,58 @@ impl OpenCLTensor {
        Ok(OpenCLEvent { event })
    }
    pub fn add_scalar_inplace(&mut self, scalar: f32) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.add_scalar_f16.set_arg(0, self.buf.clone()).unwrap();
        prg.add_scalar_f16
            .set_arg(1, self.cols_capacity as i32)
            .unwrap();
        prg.add_scalar_f16.set_arg(2, scalar).unwrap();
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .add_scalar_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn scalar_multiply_broadcast_inplace(
        &mut self,
        other: &OpenCLTensor,
    ) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.scalar_multiply_broadcast_f16
            .set_arg(0, self.buf.clone())
            .unwrap();
        prg.scalar_multiply_broadcast_f16
            .set_arg(1, other.buf.clone())
            .unwrap();
        prg.scalar_multiply_broadcast_f16
            .set_arg(2, self.cols_capacity as i32)
            .unwrap();
        prg.scalar_multiply_broadcast_f16
            .set_arg(3, other.cols_capacity as i32)
            .unwrap();
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .scalar_multiply_broadcast_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, (self.cols_capacity / 16) as usize])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn transpose_from(&mut self, other: &OpenCLTensor) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.transpose_f16.set_arg(0, self.buf.clone()).unwrap();
@@ -247,7 +301,7 @@ impl OpenCLTensor {
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event);
            b.enq().unwrap();
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
@@ -278,6 +332,85 @@ impl OpenCLTensor {
        Ok(OpenCLEvent { event })
    }
    pub fn hadamard_product_broadcast_inplace(
        &mut self,
        other: &OpenCLTensor,
    ) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.hadamard_product_broadcast_f16
            .set_arg(0, self.buf.clone())?;
        prg.hadamard_product_broadcast_f16
            .set_arg(1, other.buf.clone())?;
        prg.hadamard_product_broadcast_f16
            .set_arg(2, self.cols_capacity as i32)?;
        prg.hadamard_product_broadcast_f16
            .set_arg(3, other.cols_capacity as i32)?;
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .hadamard_product_broadcast_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, (self.cols_capacity as usize) / 16])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn mean_cols_from(&mut self, other: &OpenCLTensor) -> Result<OpenCLEvent, OpenCLError> {
        if self.cols != 1 {
            panic!(
                "mean_cols_from: number of columns in target is not 1: {}",
                self.cols
            );
        }
        if self.rows != other.rows {
            panic!(
                "mean_cols_from: number of rows in target is not equal to number of rows in source: {} != {}",
                self.rows, other.rows
            );
        }
        let prg = self.cl.programs.write().unwrap();
        prg.mean_cols_f16.set_arg(0, self.buf.clone())?;
        prg.mean_cols_f16.set_arg(1, other.buf.clone())?;
        prg.mean_cols_f16.set_arg(2, self.cols_capacity as i32)?;
        prg.mean_cols_f16.set_arg(3, other.cols_capacity as i32)?;
        prg.mean_cols_f16.set_arg(4, other.cols as i32)?;
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .mean_cols_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, 1])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn pow_inplace(&mut self, scalar: f32) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.pow_f16.set_arg(0, self.buf.clone())?;
        prg.pow_f16.set_arg(1, self.cols_capacity as i32)?;
        prg.pow_f16.set_arg(2, scalar)?;
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .pow_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn silu_inplace(&mut self) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.silu_f16.set_arg(0, self.buf.clone())?;
@@ -296,6 +429,44 @@ impl OpenCLTensor {
        Ok(OpenCLEvent { event })
    }
    pub fn add_inplace(&mut self, left: &OpenCLTensor) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.add_f16.set_arg(0, self.buf.clone())?;
        prg.add_f16.set_arg(1, left.buf.clone())?;
        prg.add_f16.set_arg(2, self.cols_capacity as i32)?;
        prg.add_f16.set_arg(3, left.cols_capacity as i32)?;
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .add_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn rsqrt_inplace(&mut self) -> Result<OpenCLEvent, OpenCLError> {
        let prg = self.cl.programs.write().unwrap();
        prg.rsqrt_f16.set_arg(0, self.buf.clone())?;
        prg.rsqrt_f16.set_arg(1, self.cols_capacity as i32)?;
        let mut event = Event::empty();
        unsafe {
            let b = prg
                .rsqrt_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
    }
    pub fn matrix_mul_inplace_transposed(
        &mut self,
        src: &OpenCLTensor,
@@ -314,69 +485,38 @@ impl OpenCLTensor {
            );
        }
        // Clear out the target memory.
        // Clear out the target memory
        unsafe { self.buf.cmd().fill(0u16, None).block(false).enq()? };
        let prg = self.cl.programs.write().unwrap();
        // 0 = CPU optimized
        // 1 = GPU optimized
        // 2 = GPU optimized vector multiply (other.rows == 1)
        const CPU: u8 = 0;
        const GPU: u8 = 1;
        let strategy: u8 = if self.cl.is_cpu_device { CPU } else { GPU };
        let prg = if strategy == CPU {
            &prg.matrix_mul_transposed_f16_cpu_optimized
        } else {
            &prg.matrix_mul_transposed_f16
        };
        prg.set_arg(0, self.buf.clone())?;
        prg.set_arg(1, src.buf.clone())?;
        prg.set_arg(2, other.buf.clone())?;
        prg.set_arg(3, src.cols_capacity as i32)?;
        prg.set_arg(4, other.cols_capacity as i32)?;
        prg.set_arg(5, self.cols_capacity as i32)?;
        prg.set_arg(6, self.rows as i32)?;
        prg.set_arg(7, self.cols as i32)?;
        prg.set_arg(8, src.cols as i32)?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(0, self.buf.clone())?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(1, src.buf.clone())?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(2, other.buf.clone())?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(3, src.cols_capacity as i32)?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(4, other.cols_capacity as i32)?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(5, self.cols_capacity as i32)?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(6, self.rows as i32)?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(7, self.cols as i32)?;
        prg.matrix_mul_transposed_by_row_f16
            .set_arg(8, src.cols as i32)?;
        let mut event = Event::empty();
        let rows16 = if self.rows % 16 == 0 {
            self.rows
        } else {
            self.rows + 16 - (self.rows % 16)
        };
        let cols16 = if self.cols % 16 == 0 {
            self.cols
        } else {
            self.cols + 16 - (self.cols % 16)
        };
        unsafe {
            if strategy == CPU {
                let b = prg
                    .cmd()
                    .queue(&self.queue)
                    .global_work_size([self.cols as usize, self.rows as usize])
                    .enew(&mut event);
                b.enq()?;
            } else if strategy == GPU {
                let b = prg
                    .cmd()
                    .queue(&self.queue)
                    .global_work_size([cols16 as usize, rows16 as usize])
                    .local_work_size([16, 16])
                    .enew(&mut event);
                b.enq()?;
            } else {
                let b = prg
                    .cmd()
                    .queue(&self.queue)
                    .global_work_size([self.cols as usize, self.rows as usize])
                    .enew(&mut event);
                b.enq()?;
            }
            let b = prg
                .matrix_mul_transposed_by_row_f16
                .cmd()
                .queue(&self.queue)
                .global_work_size([self.rows as usize, self.cols as usize])
                .enew(&mut event);
            b.enq()?;
        }
        self.last_event = Some(event.clone());
        Ok(OpenCLEvent { event })
@@ -396,27 +536,11 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
        Ok(program)
    }
    let matrix_mul_transposed_f16_program =
        make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_SRC)?;
    let matrix_mul_transposed_f16 = Kernel::builder()
        .program(&matrix_mul_transposed_f16_program)
        .name("matrix_mul_transposed_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let matrix_mul_transposed_f16_cpu_optimized_program =
        make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC)?;
    let matrix_mul_transposed_f16_cpu_optimized = Kernel::builder()
        .program(&matrix_mul_transposed_f16_cpu_optimized_program)
        .name("matrix_mul_transposed_f16_cpu_optimized")
    let matrix_mul_transposed_by_row_f16_program =
        make_program_with_src(ctx, MATRIX_MUL_TRANSPOSED_BY_ROW_F16_SRC)?;
    let matrix_mul_transposed_by_row_f16 = Kernel::builder()
        .program(&matrix_mul_transposed_by_row_f16_program)
        .name("matrix_mul_transposed_by_row_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
@@ -456,71 +580,124 @@ fn make_programs(ctx: &Context, queue: &Queue) -> Result<Programs, OpenCLError>
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let pow_f16_program = make_program_with_src(ctx, POW_F16_SRC)?;
    let pow_f16 = Kernel::builder()
        .program(&pow_f16_program)
        .name("pow_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let mean_cols_f16_program = make_program_with_src(ctx, MEAN_COLS_F16_SRC)?;
    let mean_cols_f16 = Kernel::builder()
        .program(&mean_cols_f16_program)
        .name("mean_cols_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let add_scalar_f16_program = make_program_with_src(ctx, ADD_SCALAR_F16_SRC)?;
    let add_scalar_f16 = Kernel::builder()
        .program(&add_scalar_f16_program)
        .name("add_scalar_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let scalar_multiply_broadcast_f16_program =
        make_program_with_src(ctx, SCALAR_MULTIPLY_BROADCAST_F16_SRC)?;
    let scalar_multiply_broadcast_f16 = Kernel::builder()
        .program(&scalar_multiply_broadcast_f16_program)
        .name("scalar_multiply_broadcast_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let hadamard_product_broadcast_f16_program =
        make_program_with_src(ctx, HADAMARD_PRODUCT_BROADCAST_F16_SRC)?;
    let hadamard_product_broadcast_f16 = Kernel::builder()
        .program(&hadamard_product_broadcast_f16_program)
        .name("hadamard_product_broadcast_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let rsqrt_f16_program = make_program_with_src(ctx, RSQRT_F16_SRC)?;
    let rsqrt_f16 = Kernel::builder()
        .program(&rsqrt_f16_program)
        .name("rsqrt_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    let add_f16_program = make_program_with_src(ctx, ADD_F16_SRC)?;
    let add_f16 = Kernel::builder()
        .program(&add_f16_program)
        .name("add_f16")
        .arg(None::<&Buffer<u16>>)
        .arg(None::<&Buffer<u16>>)
        .arg(&0)
        .arg(&0)
        .queue(queue.clone())
        .build()?;
    Ok(Programs {
        matrix_mul_transposed_f16_program,
        matrix_mul_transposed_f16,
        matrix_mul_transposed_f16_cpu_optimized_program,
        matrix_mul_transposed_f16_cpu_optimized,
        matrix_mul_transposed_by_row_f16_program,
        matrix_mul_transposed_by_row_f16,
        silu_f16_program,
        silu_f16,
        hadamard_product_f16_program,
        hadamard_product_f16,
        transpose_f16_program,
        transpose_f16,
        pow_f16_program,
        pow_f16,
        mean_cols_f16_program,
        mean_cols_f16,
        add_scalar_f16_program,
        add_scalar_f16,
        scalar_multiply_broadcast_f16_program,
        scalar_multiply_broadcast_f16,
        hadamard_product_broadcast_f16_program,
        hadamard_product_broadcast_f16,
        rsqrt_f16_program,
        rsqrt_f16,
        add_f16_program,
        add_f16,
    })
}
const MATRIX_MUL_TRANSPOSED_F16_SRC: &str = r#"
const MATRIX_MUL_TRANSPOSED_BY_ROW_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void matrix_mul_transposed_f16(
    __global half *tgt,
    __global const half *left,
    __global const half *right,
    const int left_cols_capacity,
    const int right_cols_capacity,
    const int ncols_capacity,
    const int nrows,
    const int ncols, // size of target
    const int shared_sz
) {
    __local float lefttile[16][16];
    __local float righttile[16][16];
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int local_x = get_local_id(0);
    const int local_y = get_local_id(1);
    const int num_tiles = (shared_sz + 15) / 16;
    float sum = 0.0f;
    for (int t = 0; t < num_tiles; ++t) {
        if (global_y < nrows) {
            lefttile[local_y][local_x] = vload_half(global_y * left_cols_capacity + t * 16 + local_x, left);
        } else {
            lefttile[local_y][local_x] = 0.0f;
        }
        if (global_x < ncols) {
            righttile[local_y][local_x] = vload_half(global_x * right_cols_capacity + t * 16 + local_y, right);
        } else {
            righttile[local_y][local_x] = 0.0f;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < 16; ++k) {
            sum += lefttile[local_y][k] * righttile[k][local_x];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (global_x < ncols && global_y < nrows) {
        vstore_half(sum, global_y * ncols_capacity + global_x, (__global half *)tgt);
    }
}
"#;
const MATRIX_MUL_TRANSPOSED_F16_CPU_OPTIMIZED_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void matrix_mul_transposed_f16_cpu_optimized(
/*
 * Matrix multiplication with a transposed second matrix, using 16-bit floats.
 *
 * One work unit per element of the target matrix.
 *
 * Assumes that each row in the matrices is zero-padded so that there's space for 32 bytes (or 16
 * halfs) of data and we don't need to care if our loops go over the bounds.
 *
 * Operations are done in float32.
 *
 * This thing is not very fast right now. I compared with PyTorch and this is like 20x slower. It
 * is still much faster than the CPU. Not sure if PyTorch uses cuBLAS, but if we could get at
 * least somewhere around 50% of that speed I would be happy.
 *
 * OpenCL on the CPU of a Ryzen 3950X seems to easily beat my own AVX2 operations.
 *
 * TODO: need to read resources like https://cnugteren.github.io/tutorial/pages/page1.html to
 * figure out how to do matrix multiplication faster.
 */
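/*
 * Illustrative example of the index arithmetic below: vload_half16 counts its offset in units of
 * 16 halfs, so with, say, left_cols_capacity = 64 and tgt_row = 2, iteration col16 = 1 computes
 * (2 * 64) / 16 + 1 = 9 and loads the 16 halfs at left[144..160), i.e. columns 16..31 of row 2.
 * This relies on left_cols_capacity being a multiple of 16, which the zero-padding assumption
 * above provides.
 */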
__kernel void matrix_mul_transposed_by_row_f16(
    __global half *tgt,
    __global const half *left,
    __global const half *right,
@@ -531,12 +708,14 @@ __kernel void matrix_mul_transposed_f16_cpu_optimized(
    const int ncols, // size of target
    const int shared_sz
) {
    const int tgt_col = get_global_id(0);
    const int tgt_row = get_global_id(1);
    int col_iterations = shared_sz / 16;
    if (shared_sz % 16 != 0) {
        col_iterations = col_iterations + 1;
    }
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    float16 sum = 0;
    for (int col16 = 0; col16 < col_iterations; col16++) {
        const float16 left8 = vload_half16((tgt_row * left_cols_capacity) / 16 + col16, (__global const half *)left);
@@ -555,13 +734,17 @@ __kernel void matrix_mul_transposed_f16_cpu_optimized(
    float sum6 = sum.sa + sum.sb;
    float sum7 = sum.sc + sum.sd;
    float sum8 = sum.se + sum.sf;
    float sum11 = sum1 + sum2;
    float sum12 = sum3 + sum4;
    float sum13 = sum5 + sum6;
    float sum14 = sum7 + sum8;
    float sum21 = sum11 + sum12;
    float sum22 = sum13 + sum14;
    float total = sum21 + sum22;
    vstore_half(total, 0, (__global half *)&tgt[tgt_row * ncols_capacity + tgt_col]);
}
"#;
@@ -615,3 +798,131 @@ __kernel void transpose_f16(__global half *tgt,
    vstore_half(val, tgt_row * ncols_capacity + tgt_col, (__global half *)tgt);
}
"#;
/// Computes x^scalar for every f16 value in the tensor
const POW_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void pow_f16(__global half *tgt,
                      const int ncols_capacity,
                      const float scalar)
{
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const float val = vload_half(tgt_row * ncols_capacity + tgt_col, (__global const half *)tgt);
    const float result = pow(val, scalar);
    vstore_half(result, tgt_row * ncols_capacity + tgt_col, (__global half *)tgt);
}
"#;
/// Computes the mean over the columns of each row in a tensor, producing a single-column result
const MEAN_COLS_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void mean_cols_f16(__global half *tgt,
                            __global const half *left,
                            const int ncols_capacity,
                            const int left_cols_capacity,
                            const int ncolumns)
{
    // global work group size is nrows x 1
    const int row = get_global_id(0);
    float16 src_value = 0.0;
    for (int col16 = 0; col16 < left_cols_capacity; col16 += 16) {
        const int actual_col = col16;
        if (actual_col >= ncolumns) {
            break;
        }
        src_value += vload_half16((row * left_cols_capacity) / 16 + col16 / 16, (__global const half *)left);
    }
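    // Note: the final 16-wide chunk can reach past ncolumns; assuming the row padding is zeroed
    // (as the matrix multiplication comment above states), those extra lanes add nothing to the
    // sum, so dividing by ncolumns below still yields the correct mean.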
    float src_value_sum = src_value.s0 + src_value.s1 + src_value.s2 + src_value.s3 + src_value.s4 + src_value.s5 + src_value.s6 + src_value.s7 + src_value.s8 + src_value.s9 + src_value.sa + src_value.sb + src_value.sc + src_value.sd + src_value.se + src_value.sf;
    src_value_sum = src_value_sum / (float) ncolumns;
    vstore_half(src_value_sum, row * ncols_capacity, (__global half *)tgt);
}
"#;
/// Adds a scalar to a tensor
const ADD_SCALAR_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void add_scalar_f16(__global half *tgt, const int ncols_capacity, const float scalar)
{
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const float val = vload_half(tgt_row * ncols_capacity + tgt_col, (__global const half *)tgt);
    const float result = val + scalar;
    vstore_half(result, tgt_row * ncols_capacity + tgt_col, (__global half *)tgt);
}
"#;
/// Multiplies each row of a tensor by the scalar taken from the corresponding row of a column vector
const SCALAR_MULTIPLY_BROADCAST_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void scalar_multiply_broadcast_f16(__global half *tgt,
                                            __global const half *left,
                                            const int ncols_capacity,
                                            const int left_cols_capacity)
{
    // global work group size is nrows x (ncols/16)
    const int row = get_global_id(0);
    const int col = get_global_id(1) * 16;
    const float scalar = vload_half(row * left_cols_capacity, (__global const half *)left);
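    // 'left' is read as a column vector: one scalar per row, which multiplies every element of
    // this row of 'tgt'.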
    float16 src_value = vload_half16((row * ncols_capacity) / 16 + col / 16, (__global const half *)tgt) * scalar;
    vstore_half16(src_value, (row * ncols_capacity) / 16 + col / 16, (__global half *)tgt);
}
"#;
/// Computes a Hadamard product that broadcasts a row vector across every row of a tensor
const HADAMARD_PRODUCT_BROADCAST_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void hadamard_product_broadcast_f16(__global half *tgt,
                                             __global const half *left,
                                             const int ncols_capacity,
                                             const int left_cols_capacity)
{
    // global work group size is nrows x (ncols/16)
    const int row = get_global_id(0);
    const int col16 = get_global_id(1) * 16;
    const float16 product_value = vload_half16(col16 / 16, (__global const half *)left);
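    // 'left' is read as a row vector (indexed only by column), so the same 16 values multiply
    // the matching columns of every row of 'tgt'.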
    const float16 src_value = vload_half16((row * ncols_capacity) / 16 + col16 / 16, (__global const half *)tgt);
    const float16 result = src_value * product_value;
    vstore_half16(result, (row * ncols_capacity) / 16 + col16 / 16, (__global half *)tgt);
}
"#;
/// Computes 1/sqrt(x) for each f16 value in the tensor
const RSQRT_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void rsqrt_f16(__global half *tgt, const int ncols_capacity)
{
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const float val = vload_half(tgt_row * ncols_capacity + tgt_col, (__global const half *)tgt);
    const float result = rsqrt(val);
    vstore_half(result, tgt_row * ncols_capacity + tgt_col, (__global half *)tgt);
}
"#;
/// Computes the element-wise sum of two tensors
const ADD_F16_SRC: &str = r#"
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void add_f16(__global half *tgt,
                      __global const half *left,
                      const int tgt_ncols_capacity,
                      const int left_ncols_capacity)
{
    const int tgt_row = get_global_id(0);
    const int tgt_col = get_global_id(1);
    const float tgt_v = vload_half(tgt_row * tgt_ncols_capacity + tgt_col, (__global const half *)tgt);
    const float left_v = vload_half(tgt_row * left_ncols_capacity + tgt_col, (__global const half *)left);
    const float result = tgt_v + left_v;
    vstore_half(result, tgt_row * tgt_ncols_capacity + tgt_col, (__global half *)tgt);
}
"#;