Implement some attention operations for OpenCL.

broken-opencl-code
Mikko Juola 3 years ago
parent 6e456e64f3
commit 35b0c372a8
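A hedged usage sketch (not part of the commit) of how the new setting gets toggled, using only the API visible in the diff below; `cl` is assumed to be an already-initialized OpenCL handle from this crate:

    // Sketch only: use_opencl() now enables the attention path as well as feedforward.
    // Calling use_opencl() without an OpenCL handle panics, per the check in this diff.
    let settings = DataSettings::new(Some(cl)).use_opencl();
    // From here on, use_opencl_for_feedforward == true and use_opencl_for_attention == true.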

@@ -36,6 +36,8 @@ pub struct DataSettings {
     #[cfg(feature = "opencl")]
     use_opencl_for_feedforward: bool,
     #[cfg(feature = "opencl")]
+    use_opencl_for_attention: bool,
+    #[cfg(feature = "opencl")]
     cl: Option<OpenCL>,
 }
@@ -48,6 +50,7 @@ impl DataSettings {
     pub fn new(cl: Option<OpenCL>) -> Self {
         DataSettings {
             use_opencl_for_feedforward: false,
+            use_opencl_for_attention: false,
             cl: cl.clone(),
         }
     }
@@ -63,6 +66,7 @@ impl DataSettings {
             panic!("OpenCL is not available, cannot call use_opencl() on DataSettings.");
         }
         self.use_opencl_for_feedforward = true;
+        self.use_opencl_for_attention = true;
         self
     }
 }
@@ -142,6 +146,7 @@ pub struct Attention {
    wo: Tensor,
    n_local_heads: usize,
    head_dim: usize,
+   data_settings: DataSettings,
 }

 #[allow(dead_code)]
@@ -285,9 +290,15 @@ impl TransformerBlock {
         data_dir: P,
     ) -> Result<Self, UnpicklingError> {
         let data_dir: &Path = data_dir.as_ref();
-        let ff = FeedForward::from_unpickled(unpickled, layer_id, data_dir, data_settings)?;
-        let attn =
-            Attention::from_unpickled(unpickled, layer_id, n_local_heads, head_dim, data_dir)?;
+        let ff = FeedForward::from_unpickled(unpickled, layer_id, data_dir, data_settings.clone())?;
+        let attn = Attention::from_unpickled(
+            unpickled,
+            layer_id,
+            n_local_heads,
+            head_dim,
+            data_settings,
+            data_dir,
+        )?;
         let ffn_norm = RMSNorm::from_unpickled(
             unpickled,
             format!("layers.{}.ffn_norm.weight", layer_id),
@@ -316,10 +327,16 @@ impl TransformerBlock {
         mask: &Option<Tensor>,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
-        let attnorm_out = self.attention_norm.forward(x);
-        let att_out = self
-            .attn
-            .forward(&attnorm_out, start_pos, freqs_cis, mask, attention_cache);
+        let mut attnorm_out = self.attention_norm.forward(x);
+        let att_out = self.attn.forward(
+            &mut attnorm_out,
+            start_pos,
+            freqs_cis,
+            mask,
+            attention_cache,
+        );
+        std::mem::drop(attnorm_out);
         let h = x.add(&att_out);
         let mut att_out = self.ffn_norm.forward(&h);
         let att_out = self.feed_forward.forward(&mut att_out).transpose();
@@ -416,9 +433,6 @@ impl FeedForward {
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
-        }
-        #[cfg(feature = "opencl")]
-        {
             if self.data_settings.use_opencl_for_feedforward {
                 *x = x.to_f16();
                 x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
@@ -458,38 +472,57 @@ impl Attention {
         layer_id: usize,
         n_local_heads: usize,
         head_dim: usize,
+        data_settings: DataSettings,
         data_dir: P,
     ) -> Result<Attention, UnpicklingError> {
         let data_dir: &Path = data_dir.as_ref();
-        let wq = Tensor::from_unpickled_pieces(
+        let mut wq = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wq.weight", layer_id),
             data_dir,
             FromPiecesDirection::Rows,
-        )?
-        .to_f32();
-        let wk = Tensor::from_unpickled_pieces(
+        )?;
+        let mut wk = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wk.weight", layer_id),
             data_dir,
             FromPiecesDirection::Rows,
-        )?
-        .to_f32();
-        let wv = Tensor::from_unpickled_pieces(
+        )?;
+        let mut wv = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wv.weight", layer_id),
             data_dir,
             FromPiecesDirection::Rows,
-        )?
-        .to_f32();
-        let wo = Tensor::from_unpickled_pieces(
+        )?;
+        let mut wo = Tensor::from_unpickled_pieces(
             unpickled,
             format!("layers.{}.attention.wo.weight", layer_id),
             data_dir,
             FromPiecesDirection::Cols,
-        )?
-        .to_f32();
+        )?;
+        #[cfg(feature = "opencl")]
+        {
+            if data_settings.use_opencl_for_attention {
+                wq = wq.to_f16();
+                wk = wk.to_f16();
+                wv = wv.to_f16();
+                wo = wo.to_f16();
+                let ds = data_settings.clone();
+                wq.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
+                wk.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
+                wv.to_gpu(&ds.cl.as_ref().unwrap().clone()).unwrap();
+                wo.to_gpu(&ds.cl.unwrap()).unwrap();
+            }
+        }
+        #[cfg(not(feature = "opencl"))]
+        {
+            wq = wq.to_f32();
+            wk = wk.to_f32();
+            wv = wv.to_f32();
+            wo = wo.to_f32();
+        }

         Ok(Self {
             wq,
@@ -498,18 +531,42 @@ impl Attention {
             wo,
             n_local_heads,
             head_dim,
+            data_settings,
         })
     }

     fn forward(
         &self,
-        x: &Tensor,
+        x: &mut Tensor,
         start_pos: usize,
         freqs_cis: &FreqsCis,
         mask: &Option<Tensor>,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
+        #[cfg(feature = "opencl")]
+        let x_was_on_cpu: bool;
+        #[cfg(feature = "opencl")]
+        {
+            x_was_on_cpu = x.is_on_cpu();
+            if self.data_settings.use_opencl_for_attention {
+                *x = x.to_f16();
+                x.to_gpu(self.data_settings.cl.as_ref().unwrap()).unwrap();
+            }
+        }
         let seq_len = x.rows();
+        #[cfg(feature = "opencl")]
+        let (xq_out, xk_out, xv_out) = {
+            let mut xq_out = x.matrix_mul_transposed(&self.wq);
+            let mut xk_out = x.matrix_mul_transposed(&self.wk);
+            let mut xv_out = x.matrix_mul_transposed(&self.wv);
+            xq_out.to_cpu().unwrap();
+            xk_out.to_cpu().unwrap();
+            xv_out.to_cpu().unwrap();
+            (xq_out.to_f32(), xk_out.to_f32(), xv_out.to_f32())
+        };
+        #[cfg(not(feature = "opencl"))]
         let (xq_out, (xk_out, xv_out)) = rayon::join(
             || x.matrix_mul_transposed(&self.wq),
             || {
@@ -604,8 +661,27 @@ impl Attention {
                     concat_vec.push(output.row(idx));
                 }
                 let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
-                let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
-                xq_row.matrix_mul_transposed(&self.wo)
+                #[cfg(not(feature = "opencl"))]
+                {
+                    let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
+                    xq_row.matrix_mul_transposed(&self.wo)
+                }
+                #[cfg(feature = "opencl")]
+                {
+                    let mut xq_row = Tensor::concat(&concat_vec2)
+                        .view(1, self.wo.rows())
+                        .to_f16();
+                    if self.wo.is_on_gpu() {
+                        xq_row
+                            .to_gpu(&self.data_settings.cl.as_ref().unwrap())
+                            .unwrap();
+                        let mut result = xq_row.matrix_mul_transposed(&self.wo);
+                        result.to_cpu().unwrap();
+                        result.to_f32()
+                    } else {
+                        xq_row.matrix_mul_transposed(&self.wo)
+                    }
+                }
             })
             .collect();
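The OpenCL branches added above all follow the same round trip, sketched here (not part of the commit) against the Tensor methods that appear in this diff:

    // Hedged sketch: weights are converted to f16 and uploaded once at load time;
    // activations are converted and uploaded per call, multiplied on the GPU, then
    // pulled back and widened to f32 for the CPU-side attention arithmetic.
    fn gpu_matmul_roundtrip(x: &mut Tensor, w: &Tensor, cl: &OpenCL) -> Tensor {
        *x = x.to_f16();                          // OpenCL path operates on f16 data
        x.to_gpu(cl).unwrap();                    // upload activations
        let mut out = x.matrix_mul_transposed(w); // out = x * w^T, computed on the GPU
        out.to_cpu().unwrap();                    // download the result
        out.to_f32()                              // rest of the pipeline expects f32
    }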
