We can now run in (mostly) f16 mode without any OpenCL. It's not the fastest way, but right now it looks like the most memory-friendly one.

master
Mikko Juola 3 years ago
parent 1f5e687298
commit 8134c20d57

@@ -49,9 +49,10 @@ You should then be ready to generate some text.
 cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B --param-path /path/to/LLaMA/7B/params.json --prompt "The meaning of life is"
 ```
-Right now it seems to use around ~25 gigabytes of memory for 7B and around ~50
-gigabytes for 13B. If you don't use OpenCL, then internally all parameters are
-cast to 32-bit floats.
+By default, it will use the weights in the precision they are in the source
+files. You can use the `--f16` command line argument to cast the largest weight
+matrices to float16. Using OpenCL will also cast the weight matrices to
+float16.
 
 You can use `--temperature`, `--top-p` and `--top-k` to adjust token sampler
 settings.
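
For a rough sense of what the `--f16` cast saves (back-of-the-envelope numbers, not measurements from this commit): weights take about 4 bytes per parameter in float32 and 2 bytes in float16, so casting the large matrices of a 7B model roughly halves their share of the footprint.

```rust
// Back-of-the-envelope estimate only; assumes ~7e9 parameters dominated by the
// large weight matrices and ignores activations, caches and other overhead.
fn main() {
    let params = 7.0e9_f64;
    let gib = |bytes: f64| bytes / 1024f64.powi(3);
    println!("float32 weights: ~{:.0} GiB", gib(params * 4.0)); // ~26 GiB
    println!("float16 weights: ~{:.0} GiB", gib(params * 2.0)); // ~13 GiB
}
```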

@@ -39,6 +39,9 @@ struct Cli {
     #[arg(long)]
     repetition_penalty: Option<f32>,
+    #[arg(long, action)]
+    f16: bool,
 
     #[cfg(feature = "opencl")]
     #[arg(long)]
     opencl_device: Option<usize>,
@@ -154,7 +157,7 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
     let max_seq_len = cli.max_seq_len.unwrap_or(1024);
 
-    let data_settings = {
+    let mut data_settings = {
         #[cfg(feature = "opencl")]
         {
             if let Some(opencl) = opencl {
@@ -168,6 +171,10 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
         DataSettings::new()
     };
 
+    if cli.f16 {
+        data_settings = data_settings.force_f16();
+    }
+
     pln!("Loading transformer weights from {}...", model_path);
     let tr = Transformer::from_unpickled(
         &unpickle_results,

@@ -205,6 +205,11 @@ impl Tensor {
         }
     }
 
+    #[inline]
+    pub fn dtype(&self) -> TensorDType {
+        self.dtype
+    }
+
     pub fn from_unpickled<P: AsRef<Path>, S: AsRef<str>>(
         unpickled: &unpickler::Value,
         name: S,
@@ -1014,11 +1019,44 @@ impl Tensor {
         false
     }
 
-    #[cfg(feature = "opencl")]
+    #[cfg(not(feature = "opencl"))]
+    pub fn is_on_gpu(&self) -> bool {
+        false
+    }
+
     pub fn is_on_cpu(&self) -> bool {
         return !self.is_on_gpu();
     }
 
+    // Casts data type to whatever the other tensor's data type is.
+    pub fn to_same_type(&self, other: &Tensor) -> Tensor {
+        if self.dtype() == other.dtype() {
+            return self.clone();
+        }
+        match other.dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
+    pub fn into_same_type(self, other: &Tensor) -> Tensor {
+        if self.dtype() == other.dtype() {
+            return self;
+        }
+        match other.dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
+    pub fn into_dtype(self, dtype: TensorDType) -> Tensor {
+        match dtype {
+            TensorDType::Float32 => self.to_f32(),
+            TensorDType::Float16 => self.to_f16(),
+        }
+    }
+
     #[cfg(feature = "opencl")]
     fn matrix_mul_inplace_transposed_gpu(&mut self, src: &Tensor, other: &Tensor) {
         let mut self_od = self.opencl_data.write().unwrap();
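
As a side note on the three conversion helpers added above: `to_same_type` borrows and clones, while `into_same_type` and `into_dtype` consume the tensor. Here is a minimal self-contained sketch of the same dispatch pattern on a stand-in type (`MiniTensor` below is hypothetical, not rllama's `Tensor`):

```rust
// Stand-in illustrating the dtype-dispatch pattern used by the new helpers.
#[derive(Clone, Copy, PartialEq, Debug)]
enum TensorDType {
    Float32,
    Float16,
}

#[derive(Clone, Debug)]
struct MiniTensor {
    dtype: TensorDType, // real storage omitted; only dtype bookkeeping shown
}

impl MiniTensor {
    fn to_f32(&self) -> MiniTensor {
        MiniTensor { dtype: TensorDType::Float32 }
    }
    fn to_f16(&self) -> MiniTensor {
        MiniTensor { dtype: TensorDType::Float16 }
    }
    // Borrowing variant: clones when the dtypes already match.
    fn to_same_type(&self, other: &MiniTensor) -> MiniTensor {
        if self.dtype == other.dtype {
            return self.clone();
        }
        match other.dtype {
            TensorDType::Float32 => self.to_f32(),
            TensorDType::Float16 => self.to_f16(),
        }
    }
    // Consuming variant: no clone needed when the dtypes already match.
    fn into_same_type(self, other: &MiniTensor) -> MiniTensor {
        if self.dtype == other.dtype {
            return self;
        }
        self.to_same_type(other)
    }
}

fn main() {
    let a = MiniTensor { dtype: TensorDType::Float16 };
    let b = MiniTensor { dtype: TensorDType::Float32 };
    assert_eq!(a.to_same_type(&b).dtype, TensorDType::Float32);
    assert_eq!(a.into_same_type(&b).dtype, TensorDType::Float32);
}
```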

@@ -39,6 +39,8 @@ pub struct DataSettings {
     use_opencl_for_attention: bool,
     #[cfg(feature = "opencl")]
     cl: Option<OpenCL>,
+    force_f16: bool,
 }
 
 // OpenCL is safe to send to threads but Rust doesn't know that
@@ -51,13 +53,14 @@ impl DataSettings {
         DataSettings {
             use_opencl_for_feedforward: false,
             use_opencl_for_attention: false,
+            force_f16: false,
             cl: cl.clone(),
         }
     }
 
     #[cfg(not(feature = "opencl"))]
     pub fn new() -> Self {
-        DataSettings {}
+        DataSettings { force_f16: false }
     }
 
     #[cfg(feature = "opencl")]
@@ -69,6 +72,11 @@ impl DataSettings {
         self.use_opencl_for_attention = true;
         self
     }
 
+    pub fn force_f16(mut self) -> DataSettings {
+        self.force_f16 = true;
+        self
+    }
 }
 
 pub struct TransformerCaches {
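
For reference, a toy sketch of how the new builder method is meant to be chained (the struct below is a stand-in, not the real `DataSettings`, which also carries OpenCL fields behind a feature flag; `cli_f16` is a made-up stand-in for the parsed `--f16` flag):

```rust
// Toy builder mirroring DataSettings::new().force_f16() from this commit.
#[derive(Debug)]
struct DataSettings {
    force_f16: bool,
}

impl DataSettings {
    fn new() -> Self {
        DataSettings { force_f16: false }
    }
    fn force_f16(mut self) -> Self {
        self.force_f16 = true;
        self
    }
}

fn main() {
    let cli_f16 = true; // stand-in for the parsed --f16 flag
    let mut settings = DataSettings::new();
    if cli_f16 {
        settings = settings.force_f16();
    }
    println!("{:?}", settings); // DataSettings { force_f16: true }
}
```
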
@@ -400,6 +408,12 @@ impl FeedForward {
             FromPiecesDirection::Rows,
         )?;
 
+        if data_settings.force_f16 {
+            w1 = w1.to_f16();
+            w2 = w2.to_f16();
+            w3 = w3.to_f16();
+        }
+
         #[cfg(feature = "opencl")]
         {
             if data_settings.use_opencl_for_feedforward {
@@ -412,12 +426,7 @@ impl FeedForward {
                 w3.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
             }
         }
 
-        #[cfg(not(feature = "opencl"))]
-        {
-            w1 = w1.to_f32();
-            w2 = w2.to_f32();
-            w3 = w3.to_f32();
-        }
+        // w1, w2 and w3 may be f32 or f16 depending on the source data.
 
         Ok(Self {
             w1,
@@ -428,31 +437,40 @@ impl FeedForward {
         }
     }
 
     pub fn forward(&self, x: &mut Tensor) -> Tensor {
+        let original_x_dtype = x.dtype();
+        if x.dtype() != self.w1.dtype() {
+            *x = x.to_same_type(&self.w1);
+        }
         #[cfg(feature = "opencl")]
         let x_was_on_cpu: bool;
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
             if self.data_settings.use_opencl_for_feedforward {
-                *x = x.to_f16();
                 x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
                     .unwrap();
             }
         }
-        let (w1_out, w3_out) = rayon::join(
+        let (mut w1_out, mut w3_out) = rayon::join(
             || self.w1.matrix_mul_transposed(x),
             || self.w3.matrix_mul_transposed(x),
         );
-        let w1_out = w1_out.silu();
-        let w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
+        // Float16 not supported for some of these ops on CPU.
+        if w1_out.is_on_cpu() && w1_out.dtype() == TensorDType::Float16 {
+            w1_out = w1_out.to_f32();
+            w3_out = w3_out.to_f32();
+        }
+        let w1_out = w1_out.silu();
+        let mut w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
+        if w1w3_out.dtype() != self.w2.dtype() {
+            w1w3_out = w1w3_out.to_same_type(&self.w2);
+        }
         #[cfg(not(feature = "opencl"))]
-        if w1w3_out.rows() == 1 {
-            self
-                .w2
-                .matrix_vector_mul_transposed_multithreaded(&w1w3_out)
-        } else {
-            self.w2.matrix_mul_transposed(&w1w3_out)
+        {
+            self.w2
+                .matrix_mul_transposed(&w1w3_out)
+                .into_dtype(original_x_dtype)
         }
         #[cfg(feature = "opencl")]
         {
@@ -503,6 +521,13 @@ impl Attention {
             FromPiecesDirection::Cols,
         )?;
 
+        if data_settings.force_f16 {
+            wq = wq.to_f16();
+            wk = wk.to_f16();
+            wv = wv.to_f16();
+            wo = wo.to_f16();
+        }
+
         #[cfg(feature = "opencl")]
         {
             if data_settings.use_opencl_for_attention {
@@ -517,13 +542,6 @@ impl Attention {
                 wo.to_gpu_inplace(&ds.cl.unwrap()).unwrap();
             }
         }
 
-        #[cfg(not(feature = "opencl"))]
-        {
-            wq = wq.to_f32();
-            wk = wk.to_f32();
-            wv = wv.to_f32();
-            wo = wo.to_f32();
-        }
         Ok(Self {
             wq,
@@ -544,13 +562,17 @@ impl Attention {
         mask: &Option<Tensor>,
         attention_cache: &mut AttentionCache,
     ) -> Tensor {
+        let original_x_dtype = x.dtype();
+        if x.dtype() != self.wq.dtype() {
+            *x = x.to_same_type(&self.wq);
+        }
         #[cfg(feature = "opencl")]
         let x_was_on_cpu: bool;
         #[cfg(feature = "opencl")]
         {
             x_was_on_cpu = x.is_on_cpu();
             if self.data_settings.use_opencl_for_attention {
-                *x = x.to_f16();
                 x.to_gpu_inplace(self.data_settings.cl.as_ref().unwrap())
                     .unwrap();
             }
@@ -570,11 +592,11 @@ impl Attention {
         #[cfg(not(feature = "opencl"))]
         let (xq_out, (xk_out, xv_out)) = rayon::join(
-            || x.matrix_mul_transposed(&self.wq),
+            || x.matrix_mul_transposed(&self.wq).to_f32(),
             || {
                 rayon::join(
-                    || x.matrix_mul_transposed(&self.wk),
-                    || x.matrix_mul_transposed(&self.wv),
+                    || x.matrix_mul_transposed(&self.wk).to_f32(),
+                    || x.matrix_mul_transposed(&self.wv).to_f32(),
                 )
             },
         );
@@ -666,7 +688,9 @@ impl Attention {
         #[cfg(not(feature = "opencl"))]
         {
             let xq_row = Tensor::concat(&concat_vec2).view(1, self.wo.rows());
-            xq_row.matrix_mul_transposed(&self.wo)
+            xq_row
+                .into_same_type(&self.wo)
+                .matrix_mul_transposed(&self.wo)
         }
         #[cfg(feature = "opencl")]
         {
@@ -689,7 +713,7 @@ impl Attention {
             let output3: Vec<&Tensor> = output2.iter().collect();
             let output2: Tensor = Tensor::concat(&output3);
 
-            output2
+            output2.into_dtype(original_x_dtype)
         }
     }
