Add readme, make clippy happy.

broken-opencl-code
Mikko Juola 3 years ago
parent 3b8f904f13
commit f6217e0036

@ -1,21 +1,41 @@
# AdeonLLaMA # AdeonLLaMA
This is my attempt at making the LLaMA language model working on a pure Rust This is my attempt at making the LLaMA language model working on a pure Rust
CPU implementation. CPU implementation. I was inspired by an amazing CPU implementation here:
https://github.com/ggerganov/ggml that could run GPT-J 8B models.
As of writing of this, it can run LLaMA-7B at around ~1 token per second, using As of writing of this, this can run LLaMA-7B at around ~1 token per second,
something like 1.5 threads because I haven't yet properly figured out how to using something like 1.5 threads because I haven't yet properly figured out how
multithread this. to multithread this.
It uses AVX2 intrinsics to speed up itself. It uses AVX2 intrinsics to speed up itself. Therefore, you need an x86-family
CPU to run this.
It has a Python unpickler that understands the `.pth` files used by PyTorch.
Well sort of, it doesn't unzip them automatically (see below).
# How to run # How to run
You will need the LLaMA-7B weights first. Refer to https://github.com/facebookresearch/llama/ You will need the LLaMA-7B weights first. Refer to https://github.com/facebookresearch/llama/
Once you have 7B weights, and the `tokenizer.model` it comes with, you can make Once you have 7B weights, and the `tokenizer.model` it comes with, you need to
it generate tokens: decompress it.
```shell
$ cd LLaMA
$ cd 7B
$ unzip consolidated.00.pth
```
You should then be ready to generate some text.
```shell ```shell
cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B/consolidated/data.pkl --prompt "The meaning of life is"
``` ```
Right now it seems to use around ~25 gigabytes of memory. Internally all
weights are cast to 32-bit floats.
# Future plans
This is a hobby thing for me so don't expect updates or help.

@ -40,8 +40,8 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
// We chop off file name from model_path and append "data/" // We chop off file name from model_path and append "data/"
let model_data_dir = model_path let model_data_dir = model_path
.split("/") .split('/')
.take(model_path.split("/").count() - 1) .take(model_path.split('/').count() - 1)
.collect::<Vec<&str>>() .collect::<Vec<&str>>()
.join("/") .join("/")
+ "/data/"; + "/data/";
@ -95,7 +95,7 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
continue; continue;
} }
let tok = tok.id_to_str(*tok_id); let tok = tok.id_to_str(*tok_id);
tok_str = tok_str + tok.replace("▁", " ").as_str(); tok_str += tok.replace('▁', " ").as_str();
} }
println!("{}", tok_str); println!("{}", tok_str);
} }

@ -73,7 +73,7 @@ impl Clone for Tensor {
impl Drop for Tensor { impl Drop for Tensor {
fn drop(&mut self) { fn drop(&mut self) {
unsafe { unsafe {
if self.data != std::ptr::null_mut() { if !self.data.is_null() {
std::alloc::dealloc(self.data, self.layout); std::alloc::dealloc(self.data, self.layout);
} }
} }
@ -95,7 +95,7 @@ fn horizontal_sum(mut ymm: __m256) -> f32 {
ymm = _mm256_add_ps(ymm, ymm2); ymm = _mm256_add_ps(ymm, ymm2);
ymm = _mm256_hadd_ps(ymm, ymm); ymm = _mm256_hadd_ps(ymm, ymm);
ymm = _mm256_hadd_ps(ymm, ymm); ymm = _mm256_hadd_ps(ymm, ymm);
return _mm256_cvtss_f32(ymm); _mm256_cvtss_f32(ymm)
} }
} }
@ -190,6 +190,7 @@ impl Tensor {
} }
} }
#[allow(clippy::missing_safety_doc)]
pub unsafe fn uninitialized(rows: i64, cols: i64, dtype: TensorDType) -> Self { pub unsafe fn uninitialized(rows: i64, cols: i64, dtype: TensorDType) -> Self {
if rows == 0 || cols == 0 { if rows == 0 || cols == 0 {
let mut tensor = Self::empty(); let mut tensor = Self::empty();
@ -203,7 +204,7 @@ impl Tensor {
let layout = let layout =
Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap(); Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap();
let data = unsafe { std::alloc::alloc(layout) }; let data = unsafe { std::alloc::alloc(layout) };
if data == std::ptr::null_mut() { if data.is_null() {
panic!("Failed to allocate tensor"); panic!("Failed to allocate tensor");
} }
// Even though we are uninitialized, we should zero out the extra space between the // Even though we are uninitialized, we should zero out the extra space between the
@ -443,7 +444,7 @@ impl Tensor {
} }
pub fn concat(pieces: &[&Tensor]) -> Tensor { pub fn concat(pieces: &[&Tensor]) -> Tensor {
if pieces.len() == 0 { if pieces.is_empty() {
return Tensor::empty(); return Tensor::empty();
} }
let mut total_rows: i64 = 0; let mut total_rows: i64 = 0;
@ -745,7 +746,7 @@ impl Tensor {
unsafe { unsafe {
for row in 0..self_rows { for row in 0..self_rows {
let row = row as usize; let row = row;
for col in 0..self_cols { for col in 0..self_cols {
let mut target8: __m256 = _mm256_setzero_ps(); let mut target8: __m256 = _mm256_setzero_ps();
for p in 0..src_cols_its { for p in 0..src_cols_its {
@ -819,7 +820,7 @@ impl Tensor {
for row in 0..self.rows { for row in 0..self.rows {
let mut sum8: __m256 = _mm256_setzero_ps(); let mut sum8: __m256 = _mm256_setzero_ps();
for col in 0..col_its { for col in 0..col_its {
let col = (col * 8) as usize; let col = col * 8;
let left_side8 = let left_side8 =
_mm256_loadu_ps(self_data.add((row * self.capacity_cols) as usize + col)); _mm256_loadu_ps(self_data.add((row * self.capacity_cols) as usize + col));
let right_side8 = _mm256_loadu_ps(other_data.add(col)); let right_side8 = _mm256_loadu_ps(other_data.add(col));
@ -885,7 +886,7 @@ impl Tensor {
let layout = let layout =
Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap(); Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap();
let data = unsafe { std::alloc::alloc_zeroed(layout) }; let data = unsafe { std::alloc::alloc_zeroed(layout) };
if data == std::ptr::null_mut() { if data.is_null() {
panic!("Failed to allocate tensor"); panic!("Failed to allocate tensor");
} }
Self { Self {
@ -1044,7 +1045,7 @@ impl TensorBuilder {
unsafe { unsafe {
std::ptr::copy_nonoverlapping(buf.as_ptr(), tensor.data.add(cursor), buf.len()); std::ptr::copy_nonoverlapping(buf.as_ptr(), tensor.data.add(cursor), buf.len());
} }
cursor = cursor + (tensor.capacity_cols as usize * 2); cursor += tensor.capacity_cols as usize * 2;
} }
Ok(tensor.to_f32()) Ok(tensor.to_f32())
} }

@ -8,6 +8,12 @@ pub struct TokenSampler {
top_k: usize, top_k: usize,
} }
impl Default for TokenSampler {
fn default() -> Self {
Self::new()
}
}
impl TokenSampler { impl TokenSampler {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
@ -58,7 +64,7 @@ impl TokenSampler {
logitsf.push((i as TokenId, logits.get_f32(0, i))); logitsf.push((i as TokenId, logits.get_f32(0, i)));
} }
logitsf.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); logitsf.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
logitsf.truncate(self.top_k as usize); logitsf.truncate(self.top_k);
let mut p_accum: f32 = 0.0; let mut p_accum: f32 = 0.0;
for (idx, v) in logitsf.iter().enumerate() { for (idx, v) in logitsf.iter().enumerate() {
p_accum += v.1; p_accum += v.1;

@ -142,7 +142,7 @@ impl Tokenizer {
pub fn tokenize_to_ids<S: AsRef<str>>(&self, s: S) -> Vec<TokenId> { pub fn tokenize_to_ids<S: AsRef<str>>(&self, s: S) -> Vec<TokenId> {
let mut s: String = format!("▁{}", s.as_ref()); let mut s: String = format!("▁{}", s.as_ref());
// Replace all space characters with a special token. // Replace all space characters with a special token.
s = s.replace(" ", "▁"); s = s.replace(' ', "▁");
let pieces = self.tokenize_to_pieces(s); let pieces = self.tokenize_to_pieces(s);
let mut result = Vec::new(); let mut result = Vec::new();

@ -85,6 +85,7 @@ pub struct FeedForward {
} }
impl Transformer { impl Transformer {
#[allow(clippy::too_many_arguments)]
pub fn from_unpickled<P: AsRef<Path>>( pub fn from_unpickled<P: AsRef<Path>>(
unpickled: &unpickler::Value, unpickled: &unpickler::Value,
emb: Embedding, emb: Embedding,
@ -117,9 +118,8 @@ impl Transformer {
.collect::<Result<Vec<TransformerBlock>, UnpicklingError>>()?; .collect::<Result<Vec<TransformerBlock>, UnpicklingError>>()?;
std::mem::drop(progress_bar); std::mem::drop(progress_bar);
let norm = RMSNorm::from_unpickled(unpickled, format!("norm.weight"), eps, data_dir)?; let norm = RMSNorm::from_unpickled(unpickled, "norm.weight".to_string(), eps, data_dir)?;
let output = let output = Tensor::from_unpickled(unpickled, "output.weight", data_dir)?.to_f32();
Tensor::from_unpickled(unpickled, format!("output.weight"), data_dir)?.to_f32();
Ok(Transformer { Ok(Transformer {
freqs_cis: compute_freqs_cis(dim / n_heads, max_seq_len * 2, 10000.0), freqs_cis: compute_freqs_cis(dim / n_heads, max_seq_len * 2, 10000.0),
@ -189,8 +189,8 @@ impl Transformer {
} }
let out = self.norm.forward(&emb_tensor); let out = self.norm.forward(&emb_tensor);
let out = out.row(out.rows() - 1); let out = out.row(out.rows() - 1);
let prediction = self.output.matrix_mul_transposed(&out);
return prediction; self.output.matrix_mul_transposed(&out)
} }
} }
@ -242,8 +242,8 @@ impl TransformerBlock {
let h = x.add(&att_out); let h = x.add(&att_out);
let att_out = self.ffn_norm.forward(&h); let att_out = self.ffn_norm.forward(&h);
let att_out = self.feed_forward.forward(&att_out.transpose()).transpose(); let att_out = self.feed_forward.forward(&att_out.transpose()).transpose();
let att_out = h.add(&att_out);
return att_out; h.add(&att_out)
} }
} }
@ -255,7 +255,7 @@ impl RMSNorm {
data_dir: P, data_dir: P,
) -> Result<RMSNorm, UnpicklingError> { ) -> Result<RMSNorm, UnpicklingError> {
let data_dir: &Path = data_dir.as_ref(); let data_dir: &Path = data_dir.as_ref();
let weights = Tensor::from_unpickled(unpickled, &name, data_dir)?.to_f32(); let weights = Tensor::from_unpickled(unpickled, name, data_dir)?.to_f32();
Ok(Self { Ok(Self {
eps, eps,
weight: weights, weight: weights,
@ -265,7 +265,7 @@ impl RMSNorm {
fn forward(&self, x: &Tensor) -> Tensor { fn forward(&self, x: &Tensor) -> Tensor {
let inner = x.pow(2.0).mean_cols().add_scalar(self.eps as f32); let inner = x.pow(2.0).mean_cols().add_scalar(self.eps as f32);
let out1 = x.scalar_multiply_broadcast(&inner.rsqrt()); let out1 = x.scalar_multiply_broadcast(&inner.rsqrt());
return out1.hadamard_product_broadcast(&self.weight); out1.hadamard_product_broadcast(&self.weight)
} }
} }
@ -307,8 +307,8 @@ impl FeedForward {
); );
let w1_out = w1_out.silu(); let w1_out = w1_out.silu();
let w1w3_out = w1_out.hadamard_product(&w3_out).transpose(); let w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
let out = self.w2.matrix_mul_transposed(&w1w3_out);
return out; self.w2.matrix_mul_transposed(&w1w3_out)
} }
} }
@ -417,8 +417,8 @@ impl Attention {
let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect(); let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
let xv_row = Tensor::concat(&concat_vec2); let xv_row = Tensor::concat(&concat_vec2);
let mut cache_k = attention_cache.cache_k[idx as usize].write().unwrap(); let mut cache_k = attention_cache.cache_k[idx].write().unwrap();
let mut cache_v = attention_cache.cache_v[idx as usize].write().unwrap(); let mut cache_v = attention_cache.cache_v[idx].write().unwrap();
/* /*
let m = xq_row let m = xq_row
@ -442,21 +442,21 @@ impl Attention {
cache_v.set_f32(dim as i64, pos as i64, v); cache_v.set_f32(dim as i64, pos as i64, v);
} }
} }
let keys = cache_k.clip_cols((start_pos + seq_len as usize) as usize); let keys = cache_k.clip_cols(start_pos + seq_len as usize);
let values = cache_v.clip_cols((start_pos + seq_len as usize) as usize); let values = cache_v.clip_cols(start_pos + seq_len as usize);
let m = xq_row let m = xq_row
.matrix_mul(&keys) .matrix_mul(&keys)
.scalar_multiply_f32(1.0 / (self.head_dim as f32).sqrt()); .scalar_multiply_f32(1.0 / (self.head_dim as f32).sqrt());
let m2 = match mask {
match mask {
Some(ref mask) => m Some(ref mask) => m
.add(mask) .add(mask)
.to_f32() .to_f32()
.softmax() .softmax()
.matrix_mul_transposed(&values), .matrix_mul_transposed(&values),
None => m.softmax().matrix_mul_transposed(&values), None => m.softmax().matrix_mul_transposed(&values),
}; }
m2
}) })
.collect(); .collect();
@ -466,18 +466,18 @@ impl Attention {
.into_par_iter() .into_par_iter()
.map(|idx| { .map(|idx| {
let mut concat_vec: Vec<Tensor> = vec![]; let mut concat_vec: Vec<Tensor> = vec![];
for idx2 in 0..self.n_local_heads { for output in &output {
concat_vec.push(output[idx2 as usize].row(idx as i64)); concat_vec.push(output.row(idx));
} }
let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect(); let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
let xq_row = Tensor::concat(&concat_vec2).view(1, 4096); let xq_row = Tensor::concat(&concat_vec2).view(1, 4096);
let xq_row = xq_row.matrix_mul_transposed(&self.wo);
xq_row xq_row.matrix_mul_transposed(&self.wo)
}) })
.collect(); .collect();
let output3: Vec<&Tensor> = output2.iter().collect(); let output3: Vec<&Tensor> = output2.iter().collect();
let output2: Tensor = Tensor::concat(&output3); let output2: Tensor = Tensor::concat(&output3);
return output2; output2
} }
} }
@ -513,7 +513,7 @@ fn apply_rotary_emb(
xk_out.set_f32(row, col * 2 + 1, xk_imagpart); xk_out.set_f32(row, col * 2 + 1, xk_imagpart);
} }
} }
return (xq_out, xk_out); (xq_out, xk_out)
} }
fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis { fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis {
@ -526,8 +526,8 @@ fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis {
let mut result: Vec<Vec<f64>> = Vec::new(); let mut result: Vec<Vec<f64>> = Vec::new();
for x in 0..end { for x in 0..end {
let mut row = Vec::new(); let mut row = Vec::new();
for y in 0..freqs.len() { for freq in freqs.iter() {
let freq = freqs[y] * (x as f64); let freq = freq * (x as f64);
row.push(freq); row.push(freq);
} }
result.push(row); result.push(row);

@ -72,14 +72,14 @@ impl Value {
pub fn get_persistent_id(&self) -> Option<&Value> { pub fn get_persistent_id(&self) -> Option<&Value> {
match self { match self {
Value::PersistentId(v) => Some(&v), Value::PersistentId(v) => Some(v),
_ => None, _ => None,
} }
} }
pub fn get_tuple(&self) -> Option<&[Value]> { pub fn get_tuple(&self) -> Option<&[Value]> {
match self { match self {
Value::Tuple(v) => Some(&v), Value::Tuple(v) => Some(v),
_ => None, _ => None,
} }
} }
@ -92,7 +92,7 @@ impl Value {
Value::Global(ref module_name, ref attribute_name) => { Value::Global(ref module_name, ref attribute_name) => {
if module_name == "torch._utils" && attribute_name == "_rebuild_tensor_v2" { if module_name == "torch._utils" && attribute_name == "_rebuild_tensor_v2" {
match **args { match **args {
Value::Tuple(ref args) => self.to_tensor_builder2(&args), Value::Tuple(ref args) => self.to_tensor_builder2(args),
_ => None, _ => None,
} }
} else { } else {
@ -146,14 +146,14 @@ impl Value {
return None; return None;
} }
return Some(TensorBuilder { Some(TensorBuilder {
src_path: PathBuf::from(storage_filename), src_path: PathBuf::from(storage_filename),
dtype, dtype,
stride: row_stride, stride: row_stride,
rows, rows,
cols, cols,
nitems, nitems,
}); })
} }
fn to_tensor_builder2_6items(args: &[Value]) -> Option<TensorBuilder> { fn to_tensor_builder2_6items(args: &[Value]) -> Option<TensorBuilder> {
@ -203,14 +203,14 @@ impl Value {
return None; return None;
} }
return Some(TensorBuilder { Some(TensorBuilder {
src_path: PathBuf::from(storage_filename), src_path: PathBuf::from(storage_filename),
dtype, dtype,
stride: row_stride, stride: row_stride,
rows, rows,
cols, cols,
nitems, nitems,
}); })
/* Args should look like this (took random example from debug print) : /* Args should look like this (took random example from debug print) :
0 PERSISTENT_ID 0 PERSISTENT_ID
@ -545,7 +545,7 @@ pub fn unpickle(bytes: &[u8]) -> Result<Value, UnpicklingError> {
"Stack is empty while handling LONG_BINPUT".to_string(), "Stack is empty while handling LONG_BINPUT".to_string(),
)); ));
} }
memo.insert(key as u32, stack.last().unwrap().clone()); memo.insert(key, stack.last().unwrap().clone());
bytes = &bytes[5..]; bytes = &bytes[5..];
continue; continue;
} }

Loading…
Cancel
Save