Add some initial OpenCL stuff.

I can copy tensors to GPU and back but not much more. Maybe next time
I'll try implementing matrix_mul_transposed or something on the GPU.
broken-opencl-code
Mikko Juola 3 years ago
parent 53d367e6fa
commit a92017bf56

@ -8,13 +8,32 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
#[cfg(feature = "opencl")]
pub fn opencl_benchmarks(c: &mut Criterion) {
    // Round-trip transfer benchmarks (CPU -> GPU -> CPU) at three sizes:
    // 1x1 measures per-transfer overhead, 1024x1024 and 4096x4096 measure
    // bandwidth-bound behavior.
    let mut orig1 = Tensor::random(1, 1, TensorDType::Float16);
    let mut orig16 = Tensor::random(1024, 1024, TensorDType::Float16);
    let mut orig32 = Tensor::random(4096, 4096, TensorDType::Float16);
    let cl = OpenCL::new(false, 0).unwrap();
    c.bench_function("1x1 matrix from CPU to OpenCL device and back", |b| {
        b.iter(|| {
            orig1.to_gpu(&cl).unwrap();
            // Unwrap instead of discarding the Result: a silently failed
            // read-back would make the measurement meaningless.
            orig1.to_cpu().unwrap();
            orig1.process_waiting_for_data();
        })
    });
    c.bench_function("1024x1024 matrix from CPU to OpenCL device and back", |b| {
        b.iter(|| {
            orig16.to_gpu(&cl).unwrap();
            orig16.to_cpu().unwrap();
            orig16.process_waiting_for_data();
        })
    });
    c.bench_function("4096x4096 matrix from CPU to OpenCL device and back", |b| {
        b.iter(|| {
            orig32.to_gpu(&cl).unwrap();
            orig32.to_cpu().unwrap();
            orig32.process_waiting_for_data();
        })
    });
}

@ -1,5 +1,5 @@
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLTensor}; use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLEvent, OpenCLTensor};
use crate::unpickler; use crate::unpickler;
use crate::unpickler::UnpicklingError; use crate::unpickler::UnpicklingError;
use half::f16; use half::f16;
@ -9,6 +9,8 @@ use std::alloc::Layout;
use std::arch::x86_64::*; use std::arch::x86_64::*;
use std::io::{Read, Seek}; use std::io::{Read, Seek};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
#[cfg(feature = "opencl")]
use std::sync::{Arc, RwLock};
use thiserror::Error; use thiserror::Error;
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] #[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
@ -60,7 +62,9 @@ impl TensorDType {
pub struct Tensor { pub struct Tensor {
data: *mut u8, data: *mut u8,
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
opencl_data: Option<OpenCLTensor>, opencl_data: Arc<RwLock<Option<OpenCLTensor>>>,
#[cfg(feature = "opencl")]
waiting_for_data: Option<OpenCLEvent>, // Is OpenCL in process of sending data back to CPU?
dtype: TensorDType, dtype: TensorDType,
layout: Layout, layout: Layout,
@ -76,6 +80,18 @@ unsafe impl Sync for Tensor {}
impl Clone for Tensor { impl Clone for Tensor {
fn clone(&self) -> Self { fn clone(&self) -> Self {
#[cfg(feature = "opencl")]
{
if let Some(ref wfd) = self.waiting_for_data {
wfd.wait();
let mut od = self.opencl_data.write().unwrap();
*od = None;
}
let od = self.opencl_data.read().unwrap();
if od.is_some() {
panic!("Tried to clone a tensor that is on the GPU");
}
}
unsafe { unsafe {
let new_tensor = Tensor::uninitialized(self.rows, self.cols, self.dtype); let new_tensor = Tensor::uninitialized(self.rows, self.cols, self.dtype);
std::ptr::copy_nonoverlapping( std::ptr::copy_nonoverlapping(
@ -90,6 +106,8 @@ impl Clone for Tensor {
impl Drop for Tensor { impl Drop for Tensor {
fn drop(&mut self) { fn drop(&mut self) {
#[cfg(feature = "opencl")]
self.process_waiting_for_data_mut();
unsafe { unsafe {
if !self.data.is_null() { if !self.data.is_null() {
std::alloc::dealloc(self.data, self.layout); std::alloc::dealloc(self.data, self.layout);
@ -137,7 +155,9 @@ impl Tensor {
pub fn assume_on_cpu(&self) { pub fn assume_on_cpu(&self) {
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
{ {
if self.opencl_data.is_some() { self.process_waiting_for_data();
let od = self.opencl_data.read().unwrap();
if od.is_some() {
panic!("Tried to assume_on_cpu on a tensor that is on the GPU"); panic!("Tried to assume_on_cpu on a tensor that is on the GPU");
} }
} }
@ -252,7 +272,9 @@ impl Tensor {
Self { Self {
data: std::ptr::null_mut(), data: std::ptr::null_mut(),
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
opencl_data: None, opencl_data: Arc::new(RwLock::new(None)),
#[cfg(feature = "opencl")]
waiting_for_data: None,
dtype: TensorDType::Float16, dtype: TensorDType::Float16,
layout: Layout::from_size_align(0, 0).unwrap(), layout: Layout::from_size_align(0, 0).unwrap(),
rows: 0, rows: 0,
@ -299,7 +321,9 @@ impl Tensor {
Self { Self {
data, data,
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
opencl_data: None, opencl_data: Arc::new(RwLock::new(None)),
#[cfg(feature = "opencl")]
waiting_for_data: None,
dtype, dtype,
rows, rows,
cols, cols,
@ -1121,7 +1145,9 @@ impl Tensor {
Self { Self {
data, data,
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
opencl_data: None, opencl_data: Arc::new(RwLock::new(None)),
#[cfg(feature = "opencl")]
waiting_for_data: None,
dtype, dtype,
rows, rows,
cols, cols,
@ -1213,17 +1239,18 @@ impl Tensor {
} }
} }
/// Sends a tensor to the GPU. This is a no-op if GPU support is not enabled, or if the tensor /// Sends a tensor to the GPU. This is a no-op if the tensor is already on the GPU.
/// is already on the GPU.
/// ///
/// The tensor is moved asynchronously. /// The tensor is moved asynchronously.
#[cfg(feature = "opencl")] #[cfg(feature = "opencl")]
pub fn to_gpu(&mut self, cl: &OpenCL) -> Result<(), TensorError> { pub fn to_gpu(&mut self, cl: &OpenCL) -> Result<(), TensorError> {
if self.opencl_data.is_some() { self.process_waiting_for_data_mut();
let mut od = self.opencl_data.write().unwrap();
if od.is_some() {
return Ok(()); return Ok(());
} }
if self.dtype != TensorDType::Float16 { if self.dtype != TensorDType::Float16 {
panic!("Only float16 tensors are supported on the GPU"); panic!("to_gpu: Only float16 tensors are supported on the GPU");
} }
let cl_tensor = cl.data_u16_to_gpu( let cl_tensor = cl.data_u16_to_gpu(
self.data as *const u16, self.data as *const u16,
@ -1231,17 +1258,56 @@ impl Tensor {
(self.rows * self.capacity_cols) as usize, (self.rows * self.capacity_cols) as usize,
)?; )?;
self.data = std::ptr::null_mut(); self.data = std::ptr::null_mut();
self.opencl_data = Some(cl_tensor); *od = Some(cl_tensor);
Ok(())
}
/// Completes any in-flight GPU-to-CPU transfer and clears the GPU-side copy.
///
/// After `to_cpu` the host data arrives asynchronously; this waits for that
/// transfer (if one is pending) and then drops the OpenCL buffer so the host
/// copy becomes the only one. Safe to call when nothing is pending.
#[cfg(feature = "opencl")]
pub fn process_waiting_for_data_mut(&mut self) {
    // `take()` reads and clears the pending event in one step, replacing the
    // original borrow-then-unconditionally-assign-None sequence.
    if let Some(wfd) = self.waiting_for_data.take() {
        // Block until the read enqueued by `to_cpu` has finished.
        wfd.wait();
        // Data is now on the CPU; release the stale GPU buffer.
        *self.opencl_data.write().unwrap() = None;
    }
}
/// Shared-reference variant of `process_waiting_for_data_mut`: waits for a
/// pending GPU-to-CPU transfer (if any) and drops the GPU-side buffer.
///
/// NOTE(review): with only `&self` this cannot clear `waiting_for_data`, so
/// a subsequent call will wait on the same, already-completed event again —
/// presumably harmless but redundant; confirm this is intended.
#[cfg(feature = "opencl")]
pub fn process_waiting_for_data(&self) {
    if let Some(ref wfd) = self.waiting_for_data {
        // Block until the read enqueued by `to_cpu` has finished.
        wfd.wait();
        // Host copy has arrived; drop the OpenCL buffer.
        let mut od = self.opencl_data.write().unwrap();
        *od = None;
    }
}
/// Sends a tensor from the GPU to the CPU. This is a no-op if the tensor is
/// already on the CPU.
///
/// The transfer is asynchronous: the enqueued read finishes later, so
/// `process_waiting_for_data`/`process_waiting_for_data_mut` must run before
/// the host data is used.
#[cfg(feature = "opencl")]
pub fn to_cpu(&mut self) -> Result<(), TensorError> {
    // Finish any earlier transfer first so there is never more than one
    // in flight for this tensor.
    self.process_waiting_for_data_mut();
    let od = self.opencl_data.read().unwrap();
    if od.is_none() {
        return Ok(());
    }
    // NOTE(review): assumes `self.layout` still describes the host buffer
    // that `to_gpu` released (`to_gpu` nulls `data` but keeps `layout`).
    let data = unsafe { std::alloc::alloc(self.layout) };
    if data.is_null() {
        panic!("to_cpu: Failed to allocate tensor");
    }
    // Store the buffer on `self` *before* the fallible enqueue: if
    // `data_u16_from_gpu` fails, `Drop` will still free the allocation
    // instead of leaking it (the original assigned `self.data` only after
    // the `?`). `data` is already `*mut u8`, so no cast round-trip needed.
    self.data = data;
    let ev = od.as_ref().unwrap().data_u16_from_gpu(data as *mut u16)?;
    self.waiting_for_data = Some(ev);
    Ok(())
}
/// Blocks until this tensor's pending transfer to the GPU has completed.
/// Used mostly for benchmarking.
///
/// Panics if the tensor is not on the GPU.
#[cfg(feature = "opencl")]
pub fn wait_until_on_gpu(&mut self) {
    let mut guard = self.opencl_data.write().unwrap();
    match guard.as_mut() {
        Some(cl_tensor) => cl_tensor.wait_until_ready(),
        None => panic!("wait_until_on_gpu: Tensor is not on GPU"),
    }
}
/// Naive implementation of to_f32, used for testing that the faster methods are correct. /// Naive implementation of to_f32, used for testing that the faster methods are correct.

@ -18,6 +18,12 @@ pub struct OpenCLTensor {
write_event: Option<ocl::Event>, // if Some, the buffer is being written to write_event: Option<ocl::Event>, // if Some, the buffer is being written to
data: *const u16, // if non-null, is host pointer that should be freed data: *const u16, // if non-null, is host pointer that should be freed
data_layout: Layout, data_layout: Layout,
nitems: usize,
}
#[derive(Debug)]
pub struct OpenCLEvent {
event: ocl::Event,
} }
impl Drop for OpenCLTensor { impl Drop for OpenCLTensor {
@ -76,6 +82,10 @@ impl OpenCL {
Ok(OpenCL { ctx, queue }) Ok(OpenCL { ctx, queue })
} }
/// Asks OpenCL to start executing all commands queued so far, without
/// blocking for their completion. Any error from the flush is ignored
/// (`let _`).
pub fn flush(&self) {
    let _ = self.queue.flush();
}
pub fn data_u16_to_gpu( pub fn data_u16_to_gpu(
&self, &self,
data: *const u16, data: *const u16,
@ -99,6 +109,7 @@ impl OpenCL {
write_event: Some(event), write_event: Some(event),
data, data,
data_layout, data_layout,
nitems,
}) })
} }
} }
@ -120,4 +131,25 @@ impl OpenCLTensor {
} }
} }
} }
/// Enqueues an asynchronous read of this buffer's `nitems` u16 values into
/// `data`, returning an event to wait on for completion.
///
/// The caller must ensure `data` is valid for `self.nitems` u16 writes and
/// stays alive (and unaliased) until the returned event has completed.
pub fn data_u16_from_gpu(&self, data: *mut u16) -> Result<OpenCLEvent, OpenCLError> {
    // SAFETY: caller guarantees `data` points to `self.nitems` writable u16s
    // that outlive the enqueued read. The unsafe block is kept to the single
    // operation that needs it.
    let data_slice: &mut [u16] = unsafe { std::slice::from_raw_parts_mut(data, self.nitems) };
    let mut event = Event::empty();
    self.buf
        .cmd()
        .read(data_slice)
        .block(false)
        .enew(&mut event)
        .enq()?;
    Ok(OpenCLEvent { event })
}
}
impl OpenCLEvent {
#[inline]
pub fn wait(&self) {
self.event.wait_for().unwrap();
}
} }

Loading…
Cancel
Save