@@ -36,6 +36,8 @@ pub struct DataSettings {
     #[cfg(feature = "opencl")]
     use_opencl_for_feedforward: bool,
+    #[cfg(feature = "opencl")]
+    use_opencl_for_attention: bool,
     #[cfg(feature = "opencl")]
     cl: Option<OpenCL>,
 }
 
@@ -48,6 +50,7 @@ impl DataSettings {
     pub fn new(cl: Option<OpenCL>) -> Self {
         DataSettings {
             use_opencl_for_feedforward: false,
+            use_opencl_for_attention: false,
             cl: cl.clone(),
         }
     }
@@ -63,6 +66,7 @@ impl DataSettings {
             panic!("OpenCL is not available, cannot call use_opencl() on DataSettings.");
         }
         self.use_opencl_for_feedforward = true;
+        self.use_opencl_for_attention = true;
         self
     }
 }
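
Note: use_opencl() consumes self and hands it back, so both flags are set in
one builder-style call. A minimal call-site sketch (hypothetical; `cl: OpenCL`
obtained elsewhere, and passing None would presumably trip the panic! above):

    let settings = DataSettings::new(Some(cl)).use_opencl();
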
@@ -142,6 +146,7 @@ pub struct Attention {
     wo: Tensor,
     n_local_heads: usize,
     head_dim: usize,
+    data_settings: DataSettings,
 }
 
 #[allow(dead_code)]
@@ -285,9 +290,15 @@ impl TransformerBlock {
         data_dir: P,
     ) -> Result<Self, UnpicklingError> {
         let data_dir: &Path = data_dir.as_ref();
-        let ff = FeedForward::from_unpickled(unpickled, layer_id, data_dir, data_settings)?;
-        let attn =
-            Attention::from_unpickled(unpickled, layer_id, n_local_heads, head_dim, data_dir)?;
+        let ff = FeedForward::from_unpickled(unpickled, layer_id, data_dir, data_settings.clone())?;
+        let attn = Attention::from_unpickled(
+            unpickled,
+            layer_id,
+            n_local_heads,
+            head_dim,
+            data_settings,
+            data_dir,
+        )?;
         let ffn_norm = RMSNorm::from_unpickled(
             unpickled,
             format!("layers.{}.ffn_norm.weight", layer_id),
@@ -316,10 +327,16 @@ impl TransformerBlock {
         mask: &Option<Tensor>,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
-        let attnorm_out = self.attention_norm.forward(x);
-        let att_out = self
-            .attn
-            .forward(&attnorm_out, start_pos, freqs_cis, mask, attention_cache);
+        let mut attnorm_out = self.attention_norm.forward(x);
+        let att_out = self.attn.forward(
+            &mut attnorm_out,
+            start_pos,
+            freqs_cis,
+            mask,
+            attention_cache,
+        );
+        std::mem::drop(attnorm_out);
+
         let h = x.add(&att_out);
         let mut att_out = self.ffn_norm.forward(&h);
         let att_out = self.feed_forward.forward(&mut att_out).transpose();
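
Note: the switch to &mut attnorm_out lets Attention::forward convert the
activations to f16 and move them onto the GPU in place, and the explicit
std::mem::drop releases that buffer as soon as the attention output exists,
rather than at end of scope. The early-drop pattern in isolation
(illustrative sketch, not code from this repository):

    fn forward_like(x: &[f32]) -> f32 {
        let scratch: Vec<f32> = x.iter().map(|v| v * 2.0).collect(); // big temp
        let out = scratch.iter().sum::<f32>(); // last use of `scratch`
        std::mem::drop(scratch); // free the temp before the next allocation
        let more = vec![out; x.len()]; // allocated after the free
        more.iter().sum()
    }
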
@@ -416,9 +433,6 @@ impl FeedForward {
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
-        }
-        #[cfg(feature = "opencl")]
-        {
             if self.data_settings.use_opencl_for_feedforward {
                 *x = x.to_f16();
                 x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
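
Note: the two #[cfg(feature = "opencl")] blocks are folded into one, so
x_was_on_cpu is recorded in the same scope that uploads x. The round-trip
convention this relies on, summarized as comments (method names as used in
this diff):

    // x_was_on_cpu = x.is_on_cpu();  // remember where the caller's data lives
    // *x = x.to_f16();               // the OpenCL kernels operate on f16
    // x.to_gpu(cl).unwrap();         // upload; the tail of forward() is then
    //                                // expected to download with to_cpu() and
    //                                // widen with to_f32() iff x_was_on_cpu
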
@@ -458,38 +472,57 @@ impl Attention {
         layer_id: usize,
         n_local_heads: usize,
         head_dim: usize,
+        data_settings: DataSettings,
         data_dir: P,
     ) -> Result<Attention, UnpicklingError> {
         let data_dir: &Path = data_dir.as_ref();
 
-        let wq = Tensor::from_unpickled_pieces(
+        let mut wq = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wq.weight", layer_id),
             data_dir,
             FromPiecesDirection::Rows,
-        )?
-        .to_f32();
-        let wk = Tensor::from_unpickled_pieces(
+        )?;
+        let mut wk = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wk.weight", layer_id),
             data_dir,
             FromPiecesDirection::Rows,
-        )?
-        .to_f32();
-        let wv = Tensor::from_unpickled_pieces(
+        )?;
+        let mut wv = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wv.weight", layer_id),
             data_dir,
             FromPiecesDirection::Rows,
-        )?
-        .to_f32();
-        let wo = Tensor::from_unpickled_pieces(
+        )?;
+        let mut wo = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wo.weight", layer_id),
             data_dir,
             FromPiecesDirection::Cols,
-        )?
-        .to_f32();
+        )?;
+
+        #[cfg(feature = "opencl")]
+        {
+            if data_settings.use_opencl_for_attention {
+                wq = wq.to_f16();
+                wk = wk.to_f16();
+                wv = wv.to_f16();
+                wo = wo.to_f16();
+                let ds = data_settings.clone();
+                wq.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
+                wk.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
+                wv.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
+                wo.to_gpu(&ds.cl.unwrap()).unwrap();
+            }
+        }
+        #[cfg(not(feature = "opencl"))]
+        {
+            wq = wq.to_f32();
+            wk = wk.to_f32();
+            wv = wv.to_f32();
+            wo = wo.to_f32();
+        }
 
         Ok(Self {
             wq,
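
Note: converting wq/wk/wv/wo to f16 and uploading them here means the weight
traffic happens once per layer at load time, not per token. The repeated
ds.cl.as_ref().unwrap().clone() could also bind the handle once; an
equivalent shape (untested sketch, assuming to_gpu takes &OpenCL as it does
above):

    #[cfg(feature = "opencl")]
    {
        if data_settings.use_opencl_for_attention {
            let cl = data_settings.cl.as_ref().unwrap();
            for w in [&mut wq, &mut wk, &mut wv, &mut wo] {
                *w = w.to_f16();       // keep device-side weights in f16
                w.to_gpu(cl).unwrap(); // one upload per weight matrix
            }
        }
    }
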
@@ -498,18 +531,42 @@ impl Attention {
             wo,
             n_local_heads,
             head_dim,
+            data_settings,
         })
     }
 
     fn forward(
         &self,
-        x: &Tensor,
+        x: &mut Tensor,
         start_pos: usize,
         freqs_cis: &FreqsCis,
         mask: &Option<Tensor>,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
+        #[cfg(feature = "opencl")]
+        let x_was_on_cpu: bool;
+        #[cfg(feature = "opencl")]
+        {
+            x_was_on_cpu = x.is_on_cpu();
+            if self.data_settings.use_opencl_for_attention {
+                *x = x.to_f16();
+                x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
+            }
+        }
+
         let seq_len = x.rows();
+        #[cfg(feature = "opencl")]
+        let (xq_out, xk_out, xv_out) = {
+            let mut xq_out = x.matrix_mul_transposed(&self.wq);
+            let mut xk_out = x.matrix_mul_transposed(&self.wk);
+            let mut xv_out = x.matrix_mul_transposed(&self.wv);
+            xq_out.to_cpu().unwrap();
+            xk_out.to_cpu().unwrap();
+            xv_out.to_cpu().unwrap();
+            (xq_out.to_f32(), xk_out.to_f32(), xv_out.to_f32())
+        };
+
+        #[cfg(not(feature = "opencl"))]
         let (xq_out, (xk_out, xv_out)) = rayon::join(
             || x.matrix_mul_transposed(&self.wq),
             || {
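
Note: without OpenCL the q/k/v projections stay parallel on the CPU via
nested rayon::join, which may run its two closures on separate threads;
nesting the call gives three-way parallelism. Self-contained illustration of
the shape (not project code):

    fn main() {
        let xs = vec![1u64, 2, 3];
        // Nested join: sum runs alongside (product alongside len).
        let (a, (b, c)) = rayon::join(
            || xs.iter().sum::<u64>(),
            || rayon::join(|| xs.iter().product::<u64>(), || xs.len()),
        );
        assert_eq!((a, b, c), (6, 6, 3));
    }
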
@@ -604,8 +661,27 @@ impl Attention {
                 concat_vec.push(output.row(idx));
             }
             let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
-            let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
-            xq_row.matrix_mul_transposed(&self.wo)
+            #[cfg(not(feature = "opencl"))]
+            {
+                let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
+                xq_row.matrix_mul_transposed(&self.wo)
+            }
+            #[cfg(feature = "opencl")]
+            {
+                let mut xq_row = Tensor::concat(&concat_vec2)
+                    .view(1, self.wo.rows())
+                    .to_f16();
+                if self.wo.is_on_gpu() {
+                    xq_row
+                        .to_gpu(&self.data_settings.cl.as_ref().unwrap())
+                        .unwrap();
+                    let mut result = xq_row.matrix_mul_transposed(&self.wo);
+                    result.to_cpu().unwrap();
+                    result.to_f32()
+                } else {
+                    xq_row.matrix_mul_transposed(&self.wo)
+                }
+            }
         })
         .collect();
 
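
Note: on the GPU path the concatenated head outputs form one row vector that
is narrowed to f16, uploaded, multiplied against the device-resident wo, then
downloaded and widened back to f32: one upload and one download per output
row. The is_on_gpu() guard keeps an OpenCL build working when attention was
left on the CPU, since wo never moved and the row never needs to.

    // GPU path, per output row:
    //   f32 row -> to_f16() -> to_gpu() -> matrix_mul_transposed(wo) on device
    //           -> to_cpu() -> to_f32()
    // CPU fallback (wo still on host): matrix_mul_transposed directly.
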