We can now run in (mostly) f16 mode without any OpenCL. It's not the fastest way, but right now it looks like the most memory-friendly one.

master
Mikko Juola 3 years ago
parent 1f5e687298
commit 8134c20d57

@@ -49,9 +49,10 @@ You should then be ready to generate some text.
 cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B --param-path /path/to/LLaMA/7B/params.json --prompt "The meaning of life is"
 ```
-Right now it seems to use around ~25 gigabytes of memory for 7B and around ~50
-gigabytes for 13B. If you don't use OpenCL, then internally all parameters are
-cast to 32-bit floats.
+By default, it will use the weights in the precision they are in the source
+files. You can use the `--f16` command line argument to cast the largest weight
+matrices to float16. Using OpenCL will also cast the weight matrices to
+float16.
 
 You can use `--temperature`, `--top-p` and `--top-k` to adjust token sampler
 settings.
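
For a rough sense of what the `--f16` cast saves (back-of-the-envelope numbers, not measurements from this commit): weights take about 4 bytes per parameter in float32 and 2 bytes in float16, so casting the large matrices of a 7B model roughly halves their share of the footprint.

```rust
// Back-of-the-envelope estimate only; assumes ~7e9 parameters dominated by the
// large weight matrices and ignores activations, caches and other overhead.
fn main() {
    let params = 7.0e9_f64;
    let gib = |bytes: f64| bytes / 1024f64.powi(3);
    println!("float32 weights: ~{:.0} GiB", gib(params * 4.0)); // ~26 GiB
    println!("float16 weights: ~{:.0} GiB", gib(params * 2.0)); // ~13 GiB
}
```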

@@ -39,6 +39,9 @@ struct Cli {
     #[arg(long)]
     repetition_penalty: Option<f32>,
+    #[arg(long, action)]
+    f16: bool,
 
     #[cfg(feature = "opencl")]
     #[arg(long)]
     opencl_device: Option<usize>,
@@ -154,7 +157,7 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
     let max_seq_len = cli.max_seq_len.unwrap_or(1024);
 
-    let data_settings = {
+    let mut data_settings = {
         #[cfg(feature = "opencl")]
         {
             if let Some(opencl) = opencl {
@@ -168,6 +171,10 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
         DataSettings::new()
     };
 
+    if cli.f16 {
+        data_settings = data_settings.force_f16();
+    }
+
     pln!("Loading transformer weights from {}...", model_path);
     let tr = Transformer::from_unpickled(
         &unpickle_results,

@@ -205,6 +205,11 @@ impl Tensor {
         }
     }
 
+    #[inline]
+    pub fn dtype(&self) -> TensorDType {
+        self.dtype
+    }
+
     pub fn from_unpickled<P: AsRef<Path>, S: AsRef<str>>(
         unpickled: &unpickler::Value,
         name: S,
@@ -1014,11 +1019,44 @@ impl Tensor {
         false
     }
 
-    #[cfg(feature = "opencl")]
+    #[cfg(not(feature = "opencl"))]
+    pub fn is_on_gpu(&self) -> bool {
+        false
+    }
+
     pub fn is_on_cpu(&self) -> bool {
         return !self.is_on_gpu();
     }
 
+    // Casts data type to whatever the other tensor's data type is.
+    pub fn to_same_type(&self, other: &Tensor) -> Tensor {
+        if self.dtype() == other.dtype() {
+            return self.clone();
+        }
+        match other.dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
+    pub fn into_same_type(self, other: &Tensor) -> Tensor {
+        if self.dtype() == other.dtype() {
+            return self;
+        }
+        match other.dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
+    pub fn into_dtype(self, dtype: TensorDType) -> Tensor {
+        match dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
     #[cfg(feature = "opencl")]
     fn matrix_mul_inplace_transposed_gpu(&mut self, src: &Tensor, other: &Tensor) {
         let mut self_od = self.opencl_data.write().unwrap();
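
As a side note on the three conversion helpers added above: `to_same_type` borrows and clones, while `into_same_type` and `into_dtype` consume the tensor. Here is a minimal self-contained sketch of the same dispatch pattern on a stand-in type (`MiniTensor` below is hypothetical, not rllama's `Tensor`):

```rust
// Stand-in illustrating the dtype-dispatch pattern used by the new helpers.
#[derive(Clone, Copy, PartialEq, Debug)]
enum TensorDType {
    Float32,
    Float16,
}

#[derive(Clone, Debug)]
struct MiniTensor {
    dtype: TensorDType, // real storage omitted; only dtype bookkeeping shown
}

impl MiniTensor {
    fn to_f32(&self) -> MiniTensor {
        MiniTensor { dtype: TensorDType::Float32 }
    }
    fn to_f16(&self) -> MiniTensor {
        MiniTensor { dtype: TensorDType::Float16 }
    }
    // Borrowing variant: clones when the dtypes already match.
    fn to_same_type(&self, other: &MiniTensor) -> MiniTensor {
        if self.dtype == other.dtype {
            return self.clone();
        }
        match other.dtype {
            TensorDType::Float32 => self.to_f32(),
            TensorDType::Float16 => self.to_f16(),
        }
    }
    // Consuming variant: no clone needed when the dtypes already match.
    fn into_same_type(self, other: &MiniTensor) -> MiniTensor {
        if self.dtype == other.dtype {
            return self;
        }
        self.to_same_type(other)
    }
}

fn main() {
    let a = MiniTensor { dtype: TensorDType::Float16 };
    let b = MiniTensor { dtype: TensorDType::Float32 };
    assert_eq!(a.to_same_type(&b).dtype, TensorDType::Float32);
    assert_eq!(a.into_same_type(&b).dtype, TensorDType::Float32);
}
```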

@@ -39,6 +39,8 @@ pub struct DataSettings {
     use_opencl_for_attention: bool,
     #[cfg(feature = "opencl")]
     cl: Option<OpenCL>,
+    force_f16: bool,
 }
 
 // OpenCL is safe to send to threads but Rust doesn't know that
@@ -51,13 +53,14 @@ impl DataSettings {
         DataSettings {
             use_opencl_for_feedforward: false,
             use_opencl_for_attention: false,
+            force_f16: false,
             cl: cl.clone(),
         }
     }
 
     #[cfg(not(feature = "opencl"))]
     pub fn new() -> Self {
-        DataSettings {}
+        DataSettings { force_f16: false }
     }
 
     #[cfg(feature = "opencl")]
@@ -69,6 +72,11 @@ impl DataSettings {
         self.use_opencl_for_attention = true;
         self
     }
 
+    pub fn force_f16(mut self) -> DataSettings {
+        self.force_f16 = true;
+        self
+    }
 }
 
 pub struct TransformerCaches {
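
For reference, a toy sketch of how the new builder method is meant to be chained (the struct below is a stand-in, not the real `DataSettings`, which also carries OpenCL fields behind a feature flag; `cli_f16` is a made-up stand-in for the parsed `--f16` flag):

```rust
// Toy builder mirroring DataSettings::new().force_f16() from this commit.
#[derive(Debug)]
struct DataSettings {
    force_f16: bool,
}

impl DataSettings {
    fn new() -> Self {
        DataSettings { force_f16: false }
    }
    fn force_f16(mut self) -> Self {
        self.force_f16 = true;
        self
    }
}

fn main() {
    let cli_f16 = true; // stand-in for the parsed --f16 flag
    let mut settings = DataSettings::new();
    if cli_f16 {
        settings = settings.force_f16();
    }
    println!("{:?}", settings); // DataSettings { force_f16: true }
}
```
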
@@ -400,6 +408,12 @@ impl FeedForward {
             FromPiecesDirection::Rows,
         )?;
 
+        if data_settings.force_f16 {
+            w1 = w1.to_f16();
+            w2 = w2.to_f16();
+            w3 = w3.to_f16();
+        }
+
         #[cfg(feature = "opencl")]
         {
             if data_settings.use_opencl_for_feedforward {
@@ -412,12 +426,7 @@ impl FeedForward {
                 w3.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
             }
         }
 
-        #[cfg(not(feature = "opencl"))]
-        {
-            w1 = w1.to_f32();
-            w2 = w2.to_f32();
-            w3 = w3.to_f32();
-        }
+        // w1, w2 and w3 may be f32 or f16 depending on the source data.
 
         Ok(Self {
             w1,
@@ -428,31 +437,40 @@ impl FeedForward {
         }
     }
 
     pub fn forward(&self, x: &mut Tensor) -> Tensor {
+        let original_x_dtype = x.dtype();
+        if x.dtype() != self.w1.dtype() {
+            *x = x.to_same_type(&self.w1);
+        }
         #[cfg(feature = "opencl")]
         let x_was_on_cpu: bool;
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
             if self.data_settings.use_opencl_for_feedforward {
-                *x = x.to_f16();
                 x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
                     .unwrap();
             }
         }
-        let (w1_out, w3_out) = rayon::join(
+        let (mut w1_out, mut w3_out) = rayon::join(
             || self.w1.matrix_mul_transposed(x),
             || self.w3.matrix_mul_transposed(x),
         );
-        let w1_out = w1_out.silu();
-        let w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
+        // Float16 not supported for some of these ops on CPU.
+        if w1_out.is_on_cpu() && w1_out.dtype() == TensorDType::Float16 {
+            w1_out = w1_out.to_f32();
+            w3_out = w3_out.to_f32();
+        }
+        let w1_out = w1_out.silu();
+        let mut w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
+        if w1w3_out.dtype() != self.w2.dtype() {
+            w1w3_out = w1w3_out.to_same_type(&self.w2);
+        }
         #[cfg(not(feature = "opencl"))]
-        if w1w3_out.rows() == 1 {
-            self
-                .w2
-                .matrix_vector_mul_transposed_multithreaded(&w1w3_out)
-        } else {
-            self.w2.matrix_mul_transposed(&w1w3_out)
+        {
+            self.w2
+                .matrix_mul_transposed(&w1w3_out)
+                .into_dtype(original_x_dtype)
         }
         #[cfg(feature = "opencl")]
         {
@@ -503,6 +521,13 @@ impl Attention {
             FromPiecesDirection::Cols,
         )?;
 
+        if data_settings.force_f16 {
+            wq = wq.to_f16();
+            wk = wk.to_f16();
+            wv = wv.to_f16();
+            wo = wo.to_f16();
+        }
+
         #[cfg(feature = "opencl")]
         {
             if data_settings.use_opencl_for_attention {
@@ -517,13 +542,6 @@ impl Attention {
                 wo.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
             }
         }
 
-        #[cfg(not(feature = "opencl"))]
-        {
-            wq = wq.to_f32();
-            wk = wk.to_f32();
-            wv = wv.to_f32();
-            wo = wo.to_f32();
-        }
         Ok(Self {
             wq,
@@ -544,13 +562,17 @@ impl Attention {
         mask: &Option<Tensor>,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
+        let original_x_dtype = x.dtype();
+        if x.dtype() != self.wq.dtype() {
+            *x = x.to_same_type(&self.wq);
+        }
         #[cfg(feature = "opencl")]
         let x_was_on_cpu: bool;
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
             if self.data_settings.use_opencl_for_attention {
-                *x = x.to_f16();
                 x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
                     .unwrap();
             }
@@ -570,11 +592,11 @@ impl Attention {
         #[cfg(not(feature = "opencl"))]
         let (xq_out, (xk_out, xv_out)) = rayon::join(
-            || x.matrix_mul_transposed(&self.wq),
+            || x.matrix_mul_transposed(&self.wq).to_f32(),
             || {
                 rayon::join(
-                    || x.matrix_mul_transposed(&self.wk),
-                    || x.matrix_mul_transposed(&self.wv),
+                    || x.matrix_mul_transposed(&self.wk).to_f32(),
+                    || x.matrix_mul_transposed(&self.wv).to_f32(),
                 )
             },
         );
@@ -666,7 +688,9 @@ impl Attention {
         #[cfg(not(feature = "opencl"))]
         {
             let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
-            xq_row.matrix_mul_transposed(&self.wo)
+            xq_row
+                .into_same_type(&self.wo)
+                .matrix_mul_transposed(&self.wo)
         }
         #[cfg(feature = "opencl")]
         {
@@ -689,7 +713,7 @@ impl Attention {
             let output3: Vec<&Tensor> = output2.iter().collect();
             let output2: Tensor = Tensor::concat(&output3);
 
-            output2
+            output2.into_dtype(original_x_dtype)
         }
     }
