diff --git a/src/benches/benchmark.rs b/src/benches/benchmark.rs
index 908fb46..b6a68eb 100644
--- a/src/benches/benchmark.rs
+++ b/src/benches/benchmark.rs
@@ -8,13 +8,32 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
 #[cfg(feature = "opencl")]
 pub fn opencl_benchmarks(c: &mut Criterion) {
-    let orig16 = Tensor::random(1024, 1024, TensorDType::Float16);
+    let mut orig1 = Tensor::random(1, 1, TensorDType::Float16);
+    let mut orig16 = Tensor::random(1024, 1024, TensorDType::Float16);
+    let mut orig32 = Tensor::random(4096, 4096, TensorDType::Float16);
     let cl = OpenCL::new(false, 0).unwrap();
 
-    c.bench_function("1024x1024 matrix from CPU to OpenCL device", |b| {
+    c.bench_function("1x1 matrix from CPU to OpenCL device and back", |b| {
         b.iter(|| {
-            let mut orig16 = orig16.clone();
-            let _ = orig16.to_gpu(&cl);
+            let _ = orig1.to_gpu(&cl).unwrap();
+            let _ = orig1.to_cpu();
+            orig1.process_waiting_for_data();
+        })
+    });
+
+    c.bench_function("1024x1024 matrix from CPU to OpenCL device and back", |b| {
+        b.iter(|| {
+            let _ = orig16.to_gpu(&cl).unwrap();
+            let _ = orig16.to_cpu();
+            orig16.process_waiting_for_data();
+        })
+    });
+
+    c.bench_function("4096x4096 matrix from CPU to OpenCL device and back", |b| {
+        b.iter(|| {
+            let _ = orig32.to_gpu(&cl).unwrap();
+            let _ = orig32.to_cpu();
+            orig32.process_waiting_for_data();
         })
     });
 }
diff --git a/src/tensor.rs b/src/tensor.rs
index 6857b2b..3d7d9e9 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -1,5 +1,5 @@
 #[cfg(feature = "opencl")]
-use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLTensor};
+use crate::tensor_opencl_support::{OpenCL, OpenCLError, OpenCLEvent, OpenCLTensor};
 use crate::unpickler;
 use crate::unpickler::UnpicklingError;
 use half::f16;
@@ -9,6 +9,8 @@ use std::alloc::Layout;
 use std::arch::x86_64::*;
 use std::io::{Read, Seek};
 use std::path::{Path, PathBuf};
+#[cfg(feature = "opencl")]
+use std::sync::{Arc, RwLock};
 use thiserror::Error;
 
 #[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
@@ -60,7 +62,9 @@ impl TensorDType {
 pub struct Tensor {
     data: *mut u8,
     #[cfg(feature = "opencl")]
-    opencl_data: Option<OpenCLTensor>,
+    opencl_data: Arc<RwLock<Option<OpenCLTensor>>>,
+    #[cfg(feature = "opencl")]
+    waiting_for_data: Option<OpenCLEvent>, // Is OpenCL in process of sending data back to CPU?
 
     dtype: TensorDType,
     layout: Layout,
@@ -76,6 +80,18 @@ unsafe impl Sync for Tensor {}
 
 impl Clone for Tensor {
     fn clone(&self) -> Self {
+        #[cfg(feature = "opencl")]
+        {
+            if let Some(ref wfd) = self.waiting_for_data {
+                wfd.wait();
+                let mut od = self.opencl_data.write().unwrap();
+                *od = None;
+            }
+            let od = self.opencl_data.read().unwrap();
+            if od.is_some() {
+                panic!("Tried to clone a tensor that is on the GPU");
+            }
+        }
         unsafe {
             let new_tensor = Tensor::uninitialized(self.rows, self.cols, self.dtype);
             std::ptr::copy_nonoverlapping(
@@ -90,6 +106,8 @@ impl Clone for Tensor {
 
 impl Drop for Tensor {
     fn drop(&mut self) {
+        #[cfg(feature = "opencl")]
+        self.process_waiting_for_data_mut();
         unsafe {
             if !self.data.is_null() {
                 std::alloc::dealloc(self.data, self.layout);
@@ -137,7 +155,9 @@ impl Tensor {
     pub fn assume_on_cpu(&self) {
         #[cfg(feature = "opencl")]
         {
-            if self.opencl_data.is_some() {
+            self.process_waiting_for_data();
+            let od = self.opencl_data.read().unwrap();
+            if od.is_some() {
                 panic!("Tried to assume_on_cpu on a tensor that is on the GPU");
             }
         }
@@ -252,7 +272,9 @@ impl Tensor {
         Self {
             data: std::ptr::null_mut(),
             #[cfg(feature = "opencl")]
-            opencl_data: None,
+            opencl_data: Arc::new(RwLock::new(None)),
+            #[cfg(feature = "opencl")]
+            waiting_for_data: None,
             dtype: TensorDType::Float16,
             layout: Layout::from_size_align(0, 0).unwrap(),
             rows: 0,
@@ -299,7 +321,9 @@ impl Tensor {
         Self {
             data,
             #[cfg(feature = "opencl")]
-            opencl_data: None,
+            opencl_data: Arc::new(RwLock::new(None)),
+            #[cfg(feature = "opencl")]
+            waiting_for_data: None,
             dtype,
             rows,
             cols,
@@ -1121,7 +1145,9 @@ impl Tensor {
         Self {
             data,
             #[cfg(feature = "opencl")]
-            opencl_data: None,
+            opencl_data: Arc::new(RwLock::new(None)),
+            #[cfg(feature = "opencl")]
+            waiting_for_data: None,
             dtype,
             rows,
             cols,
@@ -1213,17 +1239,18 @@ impl Tensor {
         }
     }
 
-    /// Sends a tensor to the GPU. This is a no-op if GPU support is not enabled, or if the tensor
-    /// is already on the GPU.
+    /// Sends a tensor to the GPU. This is a no-op if the tensor is already on the GPU.
     ///
     /// The tensor is moved asynchronously.
     #[cfg(feature = "opencl")]
     pub fn to_gpu(&mut self, cl: &OpenCL) -> Result<(), TensorError> {
-        if self.opencl_data.is_some() {
+        self.process_waiting_for_data_mut();
+        let mut od = self.opencl_data.write().unwrap();
+        if od.is_some() {
             return Ok(());
         }
         if self.dtype != TensorDType::Float16 {
-            panic!("Only float16 tensors are supported on the GPU");
+            panic!("to_gpu: Only float16 tensors are supported on the GPU");
         }
         let cl_tensor = cl.data_u16_to_gpu(
             self.data as *const u16,
@@ -1231,17 +1258,56 @@ impl Tensor {
             (self.rows * self.capacity_cols) as usize,
         )?;
         self.data = std::ptr::null_mut();
-        self.opencl_data = Some(cl_tensor);
+        *od = Some(cl_tensor);
+        Ok(())
+    }
+
+    #[cfg(feature = "opencl")]
+    pub fn process_waiting_for_data_mut(&mut self) {
+        if let Some(ref wfd) = self.waiting_for_data {
+            wfd.wait();
+            let mut od = self.opencl_data.write().unwrap();
+            *od = None;
+        }
+        self.waiting_for_data = None;
+    }
+
+    #[cfg(feature = "opencl")]
+    pub fn process_waiting_for_data(&self) {
+        if let Some(ref wfd) = self.waiting_for_data {
+            wfd.wait();
+            let mut od = self.opencl_data.write().unwrap();
+            *od = None;
+        }
+    }
+
+    /// Sends a tensor from the GPU to the CPU. This is a no-op if the tensor is already on the
+    /// CPU.
+    #[cfg(feature = "opencl")]
+    pub fn to_cpu(&mut self) -> Result<(), TensorError> {
+        self.process_waiting_for_data_mut();
+        let od = self.opencl_data.read().unwrap();
+        if od.is_none() {
+            return Ok(());
+        }
+        let data = unsafe { std::alloc::alloc(self.layout) };
+        if data.is_null() {
+            panic!("to_cpu: Failed to allocate tensor");
+        }
+        let ev = od.as_ref().unwrap().data_u16_from_gpu(data as *mut u16)?;
+        self.data = data as *mut u16 as *mut u8;
+        self.waiting_for_data = Some(ev);
         Ok(())
     }
 
     /// Make sure that the tensor has finished going to GPU. Used mostly for benchmarking.
     #[cfg(feature = "opencl")]
     pub fn wait_until_on_gpu(&mut self) {
-        if self.opencl_data.is_none() {
+        let mut od = self.opencl_data.write().unwrap();
+        if od.is_none() {
             panic!("wait_until_on_gpu: Tensor is not on GPU");
         }
-        self.opencl_data.as_mut().unwrap().wait_until_ready();
+        od.as_mut().unwrap().wait_until_ready();
     }
 
     /// Naive implementation of to_f32, used for testing that the faster methods are correct.
diff --git a/src/tensor_opencl_support.rs b/src/tensor_opencl_support.rs
index fddb502..25f66b1 100644
--- a/src/tensor_opencl_support.rs
+++ b/src/tensor_opencl_support.rs
@@ -18,6 +18,12 @@ pub struct OpenCLTensor {
     write_event: Option<ocl::Event>, // if Some, the buffer is being written to
     data: *const u16,                // if non-null, is host pointer that should be freed
     data_layout: Layout,
+    nitems: usize,
+}
+
+#[derive(Debug)]
+pub struct OpenCLEvent {
+    event: ocl::Event,
 }
 
 impl Drop for OpenCLTensor {
@@ -76,6 +82,10 @@ impl OpenCL {
         Ok(OpenCL { ctx, queue })
     }
 
+    pub fn flush(&self) {
+        let _ = self.queue.flush();
+    }
+
     pub fn data_u16_to_gpu(
         &self,
         data: *const u16,
@@ -99,6 +109,7 @@ impl OpenCL {
                 write_event: Some(event),
                 data,
                 data_layout,
+                nitems,
             })
         }
     }
@@ -120,4 +131,25 @@ impl OpenCLTensor {
             }
         }
     }
+
+    pub fn data_u16_from_gpu(&self, data: *mut u16) -> Result<OpenCLEvent, OpenCLError> {
+        unsafe {
+            let mut event = Event::empty();
+            let data_slice: &mut [u16] = std::slice::from_raw_parts_mut(data, self.nitems);
+            self.buf
+                .cmd()
+                .read(data_slice)
+                .block(false)
+                .enew(&mut event)
+                .enq()?;
+            return Ok(OpenCLEvent { event });
+        }
+    }
+}
+
+impl OpenCLEvent {
+    #[inline]
+    pub fn wait(&self) {
+        self.event.wait_for().unwrap();
+    }
 }