From f6217e00367a6e2845f2a5053f487a4c44352bae Mon Sep 17 00:00:00 2001
From: Mikko Juola <mikjuo@gmail.com>
Date: Sat, 11 Mar 2023 00:40:28 -0800
Subject: [PATCH] Add readme, make clippy happy.

---
 README.md            | 36 +++++++++++++++++++++++-------
 src/rllama_main.rs   |  6 ++---
 src/tensor.rs        | 17 ++++++++-------
 src/token_sampler.rs |  8 ++++++-
 src/tokenizer.rs     |  2 +-
 src/transformer.rs   | 52 ++++++++++++++++++++++----------------------
 src/unpickler.rs     | 16 +++++++-------
 7 files changed, 82 insertions(+), 55 deletions(-)
diff --git a/README.md b/README.md
index c3e2f31..768401e 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,41 @@
 # AdeonLLaMA
 
 This is my attempt at making the LLaMA language model working on a pure Rust
-CPU implementation.
+CPU implementation. I was inspired by an amazing CPU implementation here:
+https://github.com/ggerganov/ggml that could run GPT-J 6B models.
 
-As of writing of this, it can run LLaMA-7B at around ~1 token per second, using
-something like 1.5 threads because I haven't yet properly figured out how to
-multithread this.
+As of this writing, it can run LLaMA-7B at around 1 token per second, using
+something like 1.5 threads because I haven't yet properly figured out how to
+multithread this.
 
-It uses AVX2 intrinsics to speed up itself.
+It uses AVX2 intrinsics to speed itself up. Therefore, you need an x86-family
+CPU with AVX2 support to run this.
+
+It has a Python unpickler that understands the `.pth` files used by PyTorch.
+Well, sort of: it doesn't unzip them automatically (see below).
 
 # How to run
 
 You will need the LLaMA-7B weights first. Refer to https://github.com/facebookresearch/llama/
 
-Once you have 7B weights, and the `tokenizer.model` it comes with, you can make
-it generate tokens:
+Once you have the 7B weights and the `tokenizer.model` they come with, you need
+to unzip the weights file (`consolidated.00.pth`).
+
+```shell
+$ cd LLaMA
+$ cd 7B
+$ unzip consolidated.00.pth
+```
+
+You should then be ready to generate some text.
 
 ```shell
-cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B
+cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B/consolidated/data.pkl --prompt "The meaning of life is"
 ```
+
+Right now it seems to use around 25 gigabytes of memory. Internally, all
+weights are cast to 32-bit floats.
+
+# Future plans
+
+This is a hobby thing for me, so don't expect updates or help.
diff --git a/src/rllama_main.rs b/src/rllama_main.rs
index 06d9543..2ccc85b 100644
--- a/src/rllama_main.rs
+++ b/src/rllama_main.rs
@@ -40,8 +40,8 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // We chop off file name from model_path and append "data/"
     let model_data_dir = model_path
-        .split("/")
-        .take(model_path.split("/").count() - 1)
+        .split('/')
+        .take(model_path.split('/').count() - 1)
         .collect::<Vec<&str>>()
         .join("/")
         + "/data/";
@@ -95,7 +95,7 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
                 continue;
             }
             let tok = tok.id_to_str(*tok_id);
-            tok_str = tok_str + tok.replace("▁", " ").as_str();
+            tok_str += tok.replace('▁', " ").as_str();
         }
         println!("{}", tok_str);
     }
diff --git a/src/tensor.rs b/src/tensor.rs
index 5e2106d..eabafe5 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -73,7 +73,7 @@ impl Clone for Tensor {
 impl Drop for Tensor {
     fn drop(&mut self) {
         unsafe {
-            if self.data != std::ptr::null_mut() {
+            if !self.data.is_null() {
                 std::alloc::dealloc(self.data, self.layout);
             }
         }
@@ -95,7 +95,7 @@ fn horizontal_sum(mut ymm: __m256) -> f32 {
         ymm = _mm256_add_ps(ymm, ymm2);
         ymm = _mm256_hadd_ps(ymm, ymm);
         ymm = _mm256_hadd_ps(ymm, ymm);
-        return _mm256_cvtss_f32(ymm);
+        _mm256_cvtss_f32(ymm)
     }
 }
 
@@ -190,6 +190,7 @@ impl Tensor {
         }
     }
 
+    #[allow(clippy::missing_safety_doc)]
     pub unsafe fn uninitialized(rows: i64, cols: i64, dtype: TensorDType) -> Self {
         if rows == 0 || cols == 0 {
             let mut tensor = Self::empty();
@@ -203,7 +204,7 @@ impl Tensor {
         let layout =
             Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap();
         let data = unsafe { std::alloc::alloc(layout) };
-        if data == std::ptr::null_mut() {
+        if data.is_null() {
             panic!("Failed to allocate tensor");
         }
         // Even though we are uninitialized, we should zero out the extra space between the
@@ -443,7 +444,7 @@ impl Tensor {
     }
 
     pub fn concat(pieces: &[&Tensor]) -> Tensor {
-        if pieces.len() == 0 {
+        if pieces.is_empty() {
             return Tensor::empty();
         }
         let mut total_rows: i64 = 0;
@@ -745,7 +746,7 @@ impl Tensor {
 
                 unsafe {
                     for row in 0..self_rows {
-                        let row = row as usize;
+                        let row = row;
                         for col in 0..self_cols {
                             let mut target8: __m256 = _mm256_setzero_ps();
                             for p in 0..src_cols_its {
@@ -819,7 +820,7 @@ impl Tensor {
             for row in 0..self.rows {
                 let mut sum8: __m256 = _mm256_setzero_ps();
                 for col in 0..col_its {
-                    let col = (col * 8) as usize;
+                    let col = col * 8;
                     let left_side8 =
                         _mm256_loadu_ps(self_data.add((row * self.capacity_cols) as usize + col));
                     let right_side8 = _mm256_loadu_ps(other_data.add(col));
@@ -885,7 +886,7 @@ impl Tensor {
         let layout =
             Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap();
         let data = unsafe { std::alloc::alloc_zeroed(layout) };
-        if data == std::ptr::null_mut() {
+        if data.is_null() {
             panic!("Failed to allocate tensor");
         }
         Self {
@@ -1044,7 +1045,7 @@ impl TensorBuilder {
             unsafe {
                 std::ptr::copy_nonoverlapping(buf.as_ptr(), tensor.data.add(cursor), buf.len());
             }
-            cursor = cursor + (tensor.capacity_cols as usize * 2);
+            cursor += tensor.capacity_cols as usize * 2;
         }
         Ok(tensor.to_f32())
     }
diff --git a/src/token_sampler.rs b/src/token_sampler.rs
index 36abc0f..21afe03 100644
--- a/src/token_sampler.rs
+++ b/src/token_sampler.rs
@@ -8,6 +8,12 @@ pub struct TokenSampler {
     top_k: usize,
 }
 
+impl Default for TokenSampler {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl TokenSampler {
     pub fn new() -> Self {
         Self {
@@ -58,7 +64,7 @@ impl TokenSampler {
             logitsf.push((i as TokenId, logits.get_f32(0, i)));
         }
         logitsf.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
-        logitsf.truncate(self.top_k as usize);
+        logitsf.truncate(self.top_k);
         let mut p_accum: f32 = 0.0;
         for (idx, v) in logitsf.iter().enumerate() {
             p_accum += v.1;
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 5b0f647..3642a83 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -142,7 +142,7 @@ impl Tokenizer {
     pub fn tokenize_to_ids<S: AsRef<str>>(&self, s: S) -> Vec<TokenId> {
         let mut s: String = format!("▁{}", s.as_ref());
         // Replace all space characters with a special token.
-        s = s.replace(" ", "▁");
+        s = s.replace(' ', "▁");
 
         let pieces = self.tokenize_to_pieces(s);
         let mut result = Vec::new();
diff --git a/src/transformer.rs b/src/transformer.rs
index 06c69c0..5b56afa 100644
--- a/src/transformer.rs
+++ b/src/transformer.rs
@@ -85,6 +85,7 @@ pub struct FeedForward {
 }
 
 impl Transformer {
+    #[allow(clippy::too_many_arguments)]
     pub fn from_unpickled<P: AsRef<Path>>(
         unpickled: &unpickler::Value,
         emb: Embedding,
@@ -117,9 +118,8 @@ impl Transformer {
             .collect::<Result<Vec<TransformerBlock>, UnpicklingError>>()?;
         std::mem::drop(progress_bar);
 
-        let norm = RMSNorm::from_unpickled(unpickled, format!("norm.weight"), eps, data_dir)?;
-        let output =
-            Tensor::from_unpickled(unpickled, format!("output.weight"), data_dir)?.to_f32();
+        let norm = RMSNorm::from_unpickled(unpickled, "norm.weight".to_string(), eps, data_dir)?;
+        let output = Tensor::from_unpickled(unpickled, "output.weight", data_dir)?.to_f32();
 
         Ok(Transformer {
             freqs_cis: compute_freqs_cis(dim / n_heads, max_seq_len * 2, 10000.0),
@@ -189,8 +189,8 @@ impl Transformer {
         }
         let out = self.norm.forward(&emb_tensor);
         let out = out.row(out.rows() - 1);
-        let prediction = self.output.matrix_mul_transposed(&out);
-        return prediction;
+
+        self.output.matrix_mul_transposed(&out)
     }
 }
 
@@ -242,8 +242,8 @@ impl TransformerBlock {
         let h = x.add(&att_out);
         let att_out = self.ffn_norm.forward(&h);
         let att_out = self.feed_forward.forward(&att_out.transpose()).transpose();
-        let att_out = h.add(&att_out);
-        return att_out;
+
+        h.add(&att_out)
     }
 }
 
@@ -255,7 +255,7 @@ impl RMSNorm {
         data_dir: P,
     ) -> Result<RMSNorm, UnpicklingError> {
         let data_dir: &Path = data_dir.as_ref();
-        let weights = Tensor::from_unpickled(unpickled, &name, data_dir)?.to_f32();
+        let weights = Tensor::from_unpickled(unpickled, name, data_dir)?.to_f32();
         Ok(Self {
             eps,
             weight: weights,
@@ -265,7 +265,7 @@ impl RMSNorm {
     fn forward(&self, x: &Tensor) -> Tensor {
         let inner = x.pow(2.0).mean_cols().add_scalar(self.eps as f32);
         let out1 = x.scalar_multiply_broadcast(&inner.rsqrt());
-        return out1.hadamard_product_broadcast(&self.weight);
+        out1.hadamard_product_broadcast(&self.weight)
     }
 }
 
@@ -307,8 +307,8 @@ impl FeedForward {
         );
         let w1_out = w1_out.silu();
         let w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
-        let out = self.w2.matrix_mul_transposed(&w1w3_out);
-        return out;
+
+        self.w2.matrix_mul_transposed(&w1w3_out)
     }
 }
 
@@ -417,8 +417,8 @@ impl Attention {
                 let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
                 let xv_row = Tensor::concat(&concat_vec2);
 
-                let mut cache_k = attention_cache.cache_k[idx as usize].write().unwrap();
-                let mut cache_v = attention_cache.cache_v[idx as usize].write().unwrap();
+                let mut cache_k = attention_cache.cache_k[idx].write().unwrap();
+                let mut cache_v = attention_cache.cache_v[idx].write().unwrap();
 
                 /*
                 let m = xq_row
@@ -442,21 +442,21 @@ impl Attention {
                         cache_v.set_f32(dim as i64, pos as i64, v);
                     }
                 }
-                let keys = cache_k.clip_cols((start_pos + seq_len as usize) as usize);
-                let values = cache_v.clip_cols((start_pos + seq_len as usize) as usize);
+                let keys = cache_k.clip_cols(start_pos + seq_len as usize);
+                let values = cache_v.clip_cols(start_pos + seq_len as usize);
 
                 let m = xq_row
                     .matrix_mul(&keys)
                     .scalar_multiply_f32(1.0 / (self.head_dim as f32).sqrt());
-                let m2 = match mask {
+
+                match mask {
                     Some(ref mask) => m
                         .add(mask)
                         .to_f32()
                         .softmax()
                         .matrix_mul_transposed(&values),
                     None => m.softmax().matrix_mul_transposed(&values),
-                };
-                m2
+                }
             })
             .collect();
 
@@ -466,18 +466,18 @@ impl Attention {
             .into_par_iter()
             .map(|idx| {
                 let mut concat_vec: Vec<Tensor> = vec![];
-                for idx2 in 0..self.n_local_heads {
-                    concat_vec.push(output[idx2 as usize].row(idx as i64));
+                for output in &output {
+                    concat_vec.push(output.row(idx));
                 }
                 let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
                 let xq_row = Tensor::concat(&concat_vec2).view(1, 4096);
-                let xq_row = xq_row.matrix_mul_transposed(&self.wo);
-                xq_row
+
+                xq_row.matrix_mul_transposed(&self.wo)
             })
             .collect();
         let output3: Vec<&Tensor> = output2.iter().collect();
         let output2: Tensor = Tensor::concat(&output3);
-        return output2;
+        output2
     }
 }
 
@@ -513,7 +513,7 @@ fn apply_rotary_emb(
             xk_out.set_f32(row, col * 2 + 1, xk_imagpart);
         }
     }
-    return (xq_out, xk_out);
+    (xq_out, xk_out)
 }
 
 fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis {
@@ -526,8 +526,8 @@ fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis {
     let mut result: Vec<Vec<f64>> = Vec::new();
     for x in 0..end {
         let mut row = Vec::new();
-        for y in 0..freqs.len() {
-            let freq = freqs[y] * (x as f64);
+        for freq in freqs.iter() {
+            let freq = freq * (x as f64);
             row.push(freq);
         }
         result.push(row);
diff --git a/src/unpickler.rs b/src/unpickler.rs
index f4744c2..df808b2 100644
--- a/src/unpickler.rs
+++ b/src/unpickler.rs
@@ -72,14 +72,14 @@ impl Value {
 
     pub fn get_persistent_id(&self) -> Option<&Value> {
         match self {
-            Value::PersistentId(v) => Some(&v),
+            Value::PersistentId(v) => Some(v),
             _ => None,
         }
     }
 
     pub fn get_tuple(&self) -> Option<&[Value]> {
         match self {
-            Value::Tuple(v) => Some(&v),
+            Value::Tuple(v) => Some(v),
             _ => None,
         }
     }
@@ -92,7 +92,7 @@ impl Value {
                 Value::Global(ref module_name, ref attribute_name) => {
                     if module_name == "torch._utils" && attribute_name == "_rebuild_tensor_v2" {
                         match **args {
-                            Value::Tuple(ref args) => self.to_tensor_builder2(&args),
+                            Value::Tuple(ref args) => self.to_tensor_builder2(args),
                             _ => None,
                         }
                     } else {
@@ -146,14 +146,14 @@ impl Value {
             return None;
         }
 
-        return Some(TensorBuilder {
+        Some(TensorBuilder {
             src_path: PathBuf::from(storage_filename),
             dtype,
             stride: row_stride,
             rows,
             cols,
             nitems,
-        });
+        })
     }
 
     fn to_tensor_builder2_6items(args: &[Value]) -> Option<TensorBuilder> {
@@ -203,14 +203,14 @@ impl Value {
             return None;
         }
 
-        return Some(TensorBuilder {
+        Some(TensorBuilder {
             src_path: PathBuf::from(storage_filename),
             dtype,
             stride: row_stride,
             rows,
             cols,
             nitems,
-        });
+        })
 
         /* Args should look like this (took random example from debug print) :
             0 PERSISTENT_ID
@@ -545,7 +545,7 @@ pub fn unpickle(bytes: &[u8]) -> Result<Value, UnpicklingError> {
                     "Stack is empty while handling LONG_BINPUT".to_string(),
                 ));
             }
-            memo.insert(key as u32, stack.last().unwrap().clone());
+            memo.insert(key, stack.last().unwrap().clone());
             bytes = &bytes[5..];
             continue;
         }