From f6217e00367a6e2845f2a5053f487a4c44352bae Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Sat, 11 Mar 2023 00:40:28 -0800 Subject: [PATCH] Add readme, make clippy happy. --- README.md | 36 +++++++++++++++++++++++------- src/rllama_main.rs | 6 ++--- src/tensor.rs | 17 ++++++++------- src/token_sampler.rs | 8 ++++++- src/tokenizer.rs | 2 +- src/transformer.rs | 52 ++++++++++++++++++++++---------------------- src/unpickler.rs | 16 +++++++------- 7 files changed, 82 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index c3e2f31..768401e 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,41 @@ # AdeonLLaMA This is my attempt at making the LLaMA language model working on a pure Rust -CPU implementation. +CPU implementation. I was inspired by an amazing CPU implementation here: +https://github.com/ggerganov/ggml that could run GPT-J 8B models. -As of writing of this, it can run LLaMA-7B at around ~1 token per second, using -something like 1.5 threads because I haven't yet properly figured out how to -multithread this. +As of writing of this, this can run LLaMA-7B at around ~1 token per second, +using something like 1.5 threads because I haven't yet properly figured out how +to multithread this. -It uses AVX2 intrinsics to speed up itself. +It uses AVX2 intrinsics to speed up itself. Therefore, you need an x86-family +CPU to run this. + +It has a Python unpickler that understands the `.pth` files used by PyTorch. +Well sort of, it doesn't unzip them automatically (see below). # How to run You will need the LLaMA-7B weights first. Refer to https://github.com/facebookresearch/llama/ -Once you have 7B weights, and the `tokenizer.model` it comes with, you can make -it generate tokens: +Once you have 7B weights, and the `tokenizer.model` it comes with, you need to +decompress it. + +```shell +$ cd LLaMA +$ cd 7B +$ unzip consolidated.00.pth +``` + +You should then be ready to generate some text. ```shell -cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B +cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B/consolidated/data.pkl --prompt "The meaning of life is" ``` + +Right now it seems to use around ~25 gigabytes of memory. Internally all +weights are cast to 32-bit floats. + +# Future plans + +This is a hobby thing for me so don't expect updates or help. diff --git a/src/rllama_main.rs b/src/rllama_main.rs index 06d9543..2ccc85b 100644 --- a/src/rllama_main.rs +++ b/src/rllama_main.rs @@ -40,8 +40,8 @@ pub fn main() -> Result<(), Box> { // We chop off file name from model_path and append "data/" let model_data_dir = model_path - .split("/") - .take(model_path.split("/").count() - 1) + .split('/') + .take(model_path.split('/').count() - 1) .collect::>() .join("/") + "/data/"; @@ -95,7 +95,7 @@ pub fn main() -> Result<(), Box> { continue; } let tok = tok.id_to_str(*tok_id); - tok_str = tok_str + tok.replace("▁", " ").as_str(); + tok_str += tok.replace('▁', " ").as_str(); } println!("{}", tok_str); } diff --git a/src/tensor.rs b/src/tensor.rs index 5e2106d..eabafe5 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -73,7 +73,7 @@ impl Clone for Tensor { impl Drop for Tensor { fn drop(&mut self) { unsafe { - if self.data != std::ptr::null_mut() { + if !self.data.is_null() { std::alloc::dealloc(self.data, self.layout); } } @@ -95,7 +95,7 @@ fn horizontal_sum(mut ymm: __m256) -> f32 { ymm = _mm256_add_ps(ymm, ymm2); ymm = _mm256_hadd_ps(ymm, ymm); ymm = _mm256_hadd_ps(ymm, ymm); - return _mm256_cvtss_f32(ymm); + _mm256_cvtss_f32(ymm) } } @@ -190,6 +190,7 @@ impl Tensor { } } + #[allow(clippy::missing_safety_doc)] pub unsafe fn uninitialized(rows: i64, cols: i64, dtype: TensorDType) -> Self { if rows == 0 || cols == 0 { let mut tensor = Self::empty(); @@ -203,7 +204,7 @@ impl Tensor { let layout = Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap(); let data = unsafe { std::alloc::alloc(layout) }; - if data == std::ptr::null_mut() { + if data.is_null() { panic!("Failed to allocate tensor"); } // Even though we are uninitialized, we should zero out the extra space between the @@ -443,7 +444,7 @@ impl Tensor { } pub fn concat(pieces: &[&Tensor]) -> Tensor { - if pieces.len() == 0 { + if pieces.is_empty() { return Tensor::empty(); } let mut total_rows: i64 = 0; @@ -745,7 +746,7 @@ impl Tensor { unsafe { for row in 0..self_rows { - let row = row as usize; + let row = row; for col in 0..self_cols { let mut target8: __m256 = _mm256_setzero_ps(); for p in 0..src_cols_its { @@ -819,7 +820,7 @@ impl Tensor { for row in 0..self.rows { let mut sum8: __m256 = _mm256_setzero_ps(); for col in 0..col_its { - let col = (col * 8) as usize; + let col = col * 8; let left_side8 = _mm256_loadu_ps(self_data.add((row * self.capacity_cols) as usize + col)); let right_side8 = _mm256_loadu_ps(other_data.add(col)); @@ -885,7 +886,7 @@ impl Tensor { let layout = Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap(); let data = unsafe { std::alloc::alloc_zeroed(layout) }; - if data == std::ptr::null_mut() { + if data.is_null() { panic!("Failed to allocate tensor"); } Self { @@ -1044,7 +1045,7 @@ impl TensorBuilder { unsafe { std::ptr::copy_nonoverlapping(buf.as_ptr(), tensor.data.add(cursor), buf.len()); } - cursor = cursor + (tensor.capacity_cols as usize * 2); + cursor += tensor.capacity_cols as usize * 2; } Ok(tensor.to_f32()) } diff --git a/src/token_sampler.rs b/src/token_sampler.rs index 36abc0f..21afe03 100644 --- a/src/token_sampler.rs +++ b/src/token_sampler.rs @@ -8,6 +8,12 @@ pub struct TokenSampler { top_k: usize, } +impl Default for TokenSampler { + fn default() -> Self { + Self::new() + } +} + impl TokenSampler { pub fn new() -> Self { Self { @@ -58,7 +64,7 @@ impl TokenSampler { logitsf.push((i as TokenId, logits.get_f32(0, i))); } logitsf.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); - logitsf.truncate(self.top_k as usize); + logitsf.truncate(self.top_k); let mut p_accum: f32 = 0.0; for (idx, v) in logitsf.iter().enumerate() { p_accum += v.1; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 5b0f647..3642a83 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -142,7 +142,7 @@ impl Tokenizer { pub fn tokenize_to_ids>(&self, s: S) -> Vec { let mut s: String = format!("▁{}", s.as_ref()); // Replace all space characters with a special token. - s = s.replace(" ", "▁"); + s = s.replace(' ', "▁"); let pieces = self.tokenize_to_pieces(s); let mut result = Vec::new(); diff --git a/src/transformer.rs b/src/transformer.rs index 06c69c0..5b56afa 100644 --- a/src/transformer.rs +++ b/src/transformer.rs @@ -85,6 +85,7 @@ pub struct FeedForward { } impl Transformer { + #[allow(clippy::too_many_arguments)] pub fn from_unpickled>( unpickled: &unpickler::Value, emb: Embedding, @@ -117,9 +118,8 @@ impl Transformer { .collect::, UnpicklingError>>()?; std::mem::drop(progress_bar); - let norm = RMSNorm::from_unpickled(unpickled, format!("norm.weight"), eps, data_dir)?; - let output = - Tensor::from_unpickled(unpickled, format!("output.weight"), data_dir)?.to_f32(); + let norm = RMSNorm::from_unpickled(unpickled, "norm.weight".to_string(), eps, data_dir)?; + let output = Tensor::from_unpickled(unpickled, "output.weight", data_dir)?.to_f32(); Ok(Transformer { freqs_cis: compute_freqs_cis(dim / n_heads, max_seq_len * 2, 10000.0), @@ -189,8 +189,8 @@ impl Transformer { } let out = self.norm.forward(&emb_tensor); let out = out.row(out.rows() - 1); - let prediction = self.output.matrix_mul_transposed(&out); - return prediction; + + self.output.matrix_mul_transposed(&out) } } @@ -242,8 +242,8 @@ impl TransformerBlock { let h = x.add(&att_out); let att_out = self.ffn_norm.forward(&h); let att_out = self.feed_forward.forward(&att_out.transpose()).transpose(); - let att_out = h.add(&att_out); - return att_out; + + h.add(&att_out) } } @@ -255,7 +255,7 @@ impl RMSNorm { data_dir: P, ) -> Result { let data_dir: &Path = data_dir.as_ref(); - let weights = Tensor::from_unpickled(unpickled, &name, data_dir)?.to_f32(); + let weights = Tensor::from_unpickled(unpickled, name, data_dir)?.to_f32(); Ok(Self { eps, weight: weights, @@ -265,7 +265,7 @@ impl RMSNorm { fn forward(&self, x: &Tensor) -> Tensor { let inner = x.pow(2.0).mean_cols().add_scalar(self.eps as f32); let out1 = x.scalar_multiply_broadcast(&inner.rsqrt()); - return out1.hadamard_product_broadcast(&self.weight); + out1.hadamard_product_broadcast(&self.weight) } } @@ -307,8 +307,8 @@ impl FeedForward { ); let w1_out = w1_out.silu(); let w1w3_out = w1_out.hadamard_product(&w3_out).transpose(); - let out = self.w2.matrix_mul_transposed(&w1w3_out); - return out; + + self.w2.matrix_mul_transposed(&w1w3_out) } } @@ -417,8 +417,8 @@ impl Attention { let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect(); let xv_row = Tensor::concat(&concat_vec2); - let mut cache_k = attention_cache.cache_k[idx as usize].write().unwrap(); - let mut cache_v = attention_cache.cache_v[idx as usize].write().unwrap(); + let mut cache_k = attention_cache.cache_k[idx].write().unwrap(); + let mut cache_v = attention_cache.cache_v[idx].write().unwrap(); /* let m = xq_row @@ -442,21 +442,21 @@ impl Attention { cache_v.set_f32(dim as i64, pos as i64, v); } } - let keys = cache_k.clip_cols((start_pos + seq_len as usize) as usize); - let values = cache_v.clip_cols((start_pos + seq_len as usize) as usize); + let keys = cache_k.clip_cols(start_pos + seq_len as usize); + let values = cache_v.clip_cols(start_pos + seq_len as usize); let m = xq_row .matrix_mul(&keys) .scalar_multiply_f32(1.0 / (self.head_dim as f32).sqrt()); - let m2 = match mask { + + match mask { Some(ref mask) => m .add(mask) .to_f32() .softmax() .matrix_mul_transposed(&values), None => m.softmax().matrix_mul_transposed(&values), - }; - m2 + } }) .collect(); @@ -466,18 +466,18 @@ impl Attention { .into_par_iter() .map(|idx| { let mut concat_vec: Vec = vec![]; - for idx2 in 0..self.n_local_heads { - concat_vec.push(output[idx2 as usize].row(idx as i64)); + for output in &output { + concat_vec.push(output.row(idx)); } let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect(); let xq_row = Tensor::concat(&concat_vec2).view(1, 4096); - let xq_row = xq_row.matrix_mul_transposed(&self.wo); - xq_row + + xq_row.matrix_mul_transposed(&self.wo) }) .collect(); let output3: Vec<&Tensor> = output2.iter().collect(); let output2: Tensor = Tensor::concat(&output3); - return output2; + output2 } } @@ -513,7 +513,7 @@ fn apply_rotary_emb( xk_out.set_f32(row, col * 2 + 1, xk_imagpart); } } - return (xq_out, xk_out); + (xq_out, xk_out) } fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis { @@ -526,8 +526,8 @@ fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis { let mut result: Vec> = Vec::new(); for x in 0..end { let mut row = Vec::new(); - for y in 0..freqs.len() { - let freq = freqs[y] * (x as f64); + for freq in freqs.iter() { + let freq = freq * (x as f64); row.push(freq); } result.push(row); diff --git a/src/unpickler.rs b/src/unpickler.rs index f4744c2..df808b2 100644 --- a/src/unpickler.rs +++ b/src/unpickler.rs @@ -72,14 +72,14 @@ impl Value { pub fn get_persistent_id(&self) -> Option<&Value> { match self { - Value::PersistentId(v) => Some(&v), + Value::PersistentId(v) => Some(v), _ => None, } } pub fn get_tuple(&self) -> Option<&[Value]> { match self { - Value::Tuple(v) => Some(&v), + Value::Tuple(v) => Some(v), _ => None, } } @@ -92,7 +92,7 @@ impl Value { Value::Global(ref module_name, ref attribute_name) => { if module_name == "torch._utils" && attribute_name == "_rebuild_tensor_v2" { match **args { - Value::Tuple(ref args) => self.to_tensor_builder2(&args), + Value::Tuple(ref args) => self.to_tensor_builder2(args), _ => None, } } else { @@ -146,14 +146,14 @@ impl Value { return None; } - return Some(TensorBuilder { + Some(TensorBuilder { src_path: PathBuf::from(storage_filename), dtype, stride: row_stride, rows, cols, nitems, - }); + }) } fn to_tensor_builder2_6items(args: &[Value]) -> Option { @@ -203,14 +203,14 @@ impl Value { return None; } - return Some(TensorBuilder { + Some(TensorBuilder { src_path: PathBuf::from(storage_filename), dtype, stride: row_stride, rows, cols, nitems, - }); + }) /* Args should look like this (took random example from debug print) : 0 PERSISTENT_ID @@ -545,7 +545,7 @@ pub fn unpickle(bytes: &[u8]) -> Result { "Stack is empty while handling LONG_BINPUT".to_string(), )); } - memo.insert(key as u32, stack.last().unwrap().clone()); + memo.insert(key, stack.last().unwrap().clone()); bytes = &bytes[5..]; continue; }