Add readme, make clippy happy.

broken-opencl-code
Mikko Juola 3 years ago
parent 3b8f904f13
commit f6217e0036

@@ -1,21 +1,41 @@
# AdeonLLaMA
This is my attempt at making the LLaMA language model work on a pure Rust
CPU implementation.
CPU implementation. I was inspired by an amazing CPU implementation here:
https://github.com/ggerganov/ggml that could run GPT-J 6B models.
As of writing of this, it can run LLaMA-7B at around ~1 token per second, using
something like 1.5 threads because I haven't yet properly figured out how to
multithread this.
As of writing this, it can run LLaMA-7B at around 1 token per second,
using something like 1.5 threads, because I haven't yet properly figured out
how to multithread it.
It uses AVX2 intrinsics to speed up itself.
It uses AVX2 intrinsics to speed itself up. Therefore, you need an x86-family
CPU to run this.
It has a Python unpickler that understands the `.pth` files used by PyTorch.
Well, sort of: it doesn't unzip them automatically (see below).
# How to run
You will need the LLaMA-7B weights first. Refer to https://github.com/facebookresearch/llama/
Once you have 7B weights, and the `tokenizer.model` it comes with, you can make
it generate tokens:
Once you have the 7B weights and the `tokenizer.model` that comes with them,
you need to decompress the checkpoint (the `.pth` file is a ZIP archive):
```shell
$ cd LLaMA
$ cd 7B
$ unzip consolidated.00.pth
```
You should then be ready to generate some text.
```shell
cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B
cargo run --release -- --tokenizer-model /path/to/tokenizer.model --model-path /path/to/LLaMA/7B/consolidated/data.pkl --prompt "The meaning of life is"
```
Right now it seems to use around 25 gigabytes of memory. Internally, all
weights are cast to 32-bit floats.
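That figure lines up with the model size: LLaMA-7B has roughly 6.7 billion
parameters, and 6.7e9 parameters × 4 bytes per 32-bit float is about 27 GB
before any overhead.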
# Future plans
This is a hobby project for me, so don't expect updates or help.

@@ -40,8 +40,8 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
// We chop off file name from model_path and append "data/"
let model_data_dir = model_path
.split("/")
.take(model_path.split("/").count() - 1)
.split('/')
.take(model_path.split('/').count() - 1)
.collect::<Vec<&str>>()
.join("/")
+ "/data/";
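A note on the split-and-join above: the same directory chop can also be
expressed with `std::path`. A minimal sketch, assuming `model_path` is a plain
file path (the function name is mine, not from the repo):

```rust
use std::path::Path;

// Take the parent directory of the model file and append "data/".
// Mirrors the split('/')-based chop above for ordinary paths.
fn model_data_dir(model_path: &str) -> String {
    let parent = Path::new(model_path).parent().unwrap_or_else(|| Path::new(""));
    format!("{}/data/", parent.display())
}
```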
@@ -95,7 +95,7 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
continue;
}
let tok = tok.id_to_str(*tok_id);
tok_str = tok_str + tok.replace("▁", " ").as_str();
tok_str += tok.replace('▁', " ").as_str();
}
println!("{}", tok_str);
}

@@ -73,7 +73,7 @@ impl Clone for Tensor {
impl Drop for Tensor {
fn drop(&mut self) {
unsafe {
if self.data != std::ptr::null_mut() {
if !self.data.is_null() {
std::alloc::dealloc(self.data, self.layout);
}
}
@@ -95,7 +95,7 @@ fn horizontal_sum(mut ymm: __m256) -> f32 {
ymm = _mm256_add_ps(ymm, ymm2);
ymm = _mm256_hadd_ps(ymm, ymm);
ymm = _mm256_hadd_ps(ymm, ymm);
return _mm256_cvtss_f32(ymm);
_mm256_cvtss_f32(ymm)
}
}
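For readers unfamiliar with the intrinsics: the whole routine reduces the
eight `f32` lanes of a `__m256` to a single scalar, with the two `hadd` passes
folding pairs of lanes. A scalar sketch of the same reduction (the name is
mine):

```rust
// Scalar equivalent of horizontal_sum: add up the 8 lanes of a __m256.
fn horizontal_sum_scalar(lanes: [f32; 8]) -> f32 {
    lanes.iter().sum()
}
```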
@@ -190,6 +190,7 @@ impl Tensor {
}
}
#[allow(clippy::missing_safety_doc)]
pub unsafe fn uninitialized(rows: i64, cols: i64, dtype: TensorDType) -> Self {
if rows == 0 || cols == 0 {
let mut tensor = Self::empty();
@@ -203,7 +204,7 @@ impl Tensor {
let layout =
Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap();
let data = unsafe { std::alloc::alloc(layout) };
if data == std::ptr::null_mut() {
if data.is_null() {
panic!("Failed to allocate tensor");
}
// Even though we are uninitialized, we should zero out the extra space between the
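The 32 passed to `from_size_align` presumably matches the 32-byte width of a
`__m256` register (8 × f32), so aligned AVX2 loads and stores on the buffer
are valid; that is my reading, not stated in the commit.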
@@ -443,7 +444,7 @@ impl Tensor {
}
pub fn concat(pieces: &[&Tensor]) -> Tensor {
if pieces.len() == 0 {
if pieces.is_empty() {
return Tensor::empty();
}
let mut total_rows: i64 = 0;
@@ -745,7 +746,7 @@ impl Tensor {
unsafe {
for row in 0..self_rows {
let row = row as usize;
let row = row;
for col in 0..self_cols {
let mut target8: __m256 = _mm256_setzero_ps();
for p in 0..src_cols_its {
@@ -819,7 +820,7 @@ impl Tensor {
for row in 0..self.rows {
let mut sum8: __m256 = _mm256_setzero_ps();
for col in 0..col_its {
let col = (col * 8) as usize;
let col = col * 8;
let left_side8 =
_mm256_loadu_ps(self_data.add((row * self.capacity_cols) as usize + col));
let right_side8 = _mm256_loadu_ps(other_data.add(col));
@@ -885,7 +886,7 @@ impl Tensor {
let layout =
Layout::from_size_align((nitems as usize) * dtype.bytes_per_item(), 32).unwrap();
let data = unsafe { std::alloc::alloc_zeroed(layout) };
if data == std::ptr::null_mut() {
if data.is_null() {
panic!("Failed to allocate tensor");
}
Self {
@@ -1044,7 +1045,7 @@ impl TensorBuilder {
unsafe {
std::ptr::copy_nonoverlapping(buf.as_ptr(), tensor.data.add(cursor), buf.len());
}
cursor = cursor + (tensor.capacity_cols as usize * 2);
cursor += tensor.capacity_cols as usize * 2;
}
Ok(tensor.to_f32())
}
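The `* 2` here is, as far as I can tell, 2 bytes per item for the f16 source
data that `to_f32()` then widens; that reading is mine, not stated in the
commit.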

@@ -8,6 +8,12 @@ pub struct TokenSampler {
top_k: usize,
}
impl Default for TokenSampler {
fn default() -> Self {
Self::new()
}
}
impl TokenSampler {
pub fn new() -> Self {
Self {
@@ -58,7 +64,7 @@ impl TokenSampler {
logitsf.push((i as TokenId, logits.get_f32(0, i)));
}
logitsf.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
logitsf.truncate(self.top_k as usize);
logitsf.truncate(self.top_k);
let mut p_accum: f32 = 0.0;
for (idx, v) in logitsf.iter().enumerate() {
p_accum += v.1;
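The loop above is top-k sampling: sort by probability, keep the k most likely
tokens, then walk the list accumulating probability mass until a random draw
is covered. A self-contained sketch under those assumptions (the names and the
explicit renormalization are mine):

```rust
// Top-k sampling sketch: keep the k most likely tokens, renormalize,
// and pick the first token whose cumulative probability covers `coin`
// (a uniform random number in [0, 1)).
fn sample_top_k(mut probs: Vec<(usize, f32)>, k: usize, coin: f32) -> usize {
    probs.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    probs.truncate(k);
    let total: f32 = probs.iter().map(|(_, p)| p).sum();
    let mut acc = 0.0;
    for (id, p) in &probs {
        acc += p / total;
        if coin < acc {
            return *id;
        }
    }
    probs.last().unwrap().0 // fall back to the least likely kept token
}
```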

@@ -142,7 +142,7 @@ impl Tokenizer {
pub fn tokenize_to_ids<S: AsRef<str>>(&self, s: S) -> Vec<TokenId> {
let mut s: String = format!("▁{}", s.as_ref());
// Replace all space characters with a special token.
s = s.replace(" ", "▁");
s = s.replace(' ', "▁");
let pieces = self.tokenize_to_pieces(s);
let mut result = Vec::new();
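For context, this is SentencePiece-style whitespace handling: spaces are
rewritten to U+2581 ("▁", lower one eighth block) so word boundaries survive
tokenization into pieces. A minimal sketch of the pre-tokenization step (the
function name is mine):

```rust
// Prefix with ▁ and turn every space into ▁, SentencePiece-style.
fn pretokenize(s: &str) -> String {
    format!("▁{}", s).replace(' ', "▁")
}
```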

@@ -85,6 +85,7 @@ pub struct FeedForward {
}
impl Transformer {
#[allow(clippy::too_many_arguments)]
pub fn from_unpickled<P: AsRef<Path>>(
unpickled: &unpickler::Value,
emb: Embedding,
@@ -117,9 +118,8 @@ impl Transformer {
.collect::<Result<Vec<TransformerBlock>, UnpicklingError>>()?;
std::mem::drop(progress_bar);
let norm = RMSNorm::from_unpickled(unpickled, format!("norm.weight"), eps, data_dir)?;
let output =
Tensor::from_unpickled(unpickled, format!("output.weight"), data_dir)?.to_f32();
let norm = RMSNorm::from_unpickled(unpickled, "norm.weight".to_string(), eps, data_dir)?;
let output = Tensor::from_unpickled(unpickled, "output.weight", data_dir)?.to_f32();
Ok(Transformer {
freqs_cis: compute_freqs_cis(dim / n_heads, max_seq_len * 2, 10000.0),
@@ -189,8 +189,8 @@ impl Transformer {
}
let out = self.norm.forward(&emb_tensor);
let out = out.row(out.rows() - 1);
let prediction = self.output.matrix_mul_transposed(&out);
return prediction;
self.output.matrix_mul_transposed(&out)
}
}
@@ -242,8 +242,8 @@ impl TransformerBlock {
let h = x.add(&att_out);
let att_out = self.ffn_norm.forward(&h);
let att_out = self.feed_forward.forward(&att_out.transpose()).transpose();
let att_out = h.add(&att_out);
return att_out;
h.add(&att_out)
}
}
@@ -255,7 +255,7 @@ impl RMSNorm {
data_dir: P,
) -> Result<RMSNorm, UnpicklingError> {
let data_dir: &Path = data_dir.as_ref();
let weights = Tensor::from_unpickled(unpickled, &name, data_dir)?.to_f32();
let weights = Tensor::from_unpickled(unpickled, name, data_dir)?.to_f32();
Ok(Self {
eps,
weight: weights,
@@ -265,7 +265,7 @@ impl RMSNorm {
fn forward(&self, x: &Tensor) -> Tensor {
let inner = x.pow(2.0).mean_cols().add_scalar(self.eps as f32);
let out1 = x.scalar_multiply_broadcast(&inner.rsqrt());
return out1.hadamard_product_broadcast(&self.weight);
out1.hadamard_product_broadcast(&self.weight)
}
}
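Written out, the forward pass above is RMS normalization: each element of `x`
is scaled by the reciprocal root-mean-square of the row (plus `eps`), then
multiplied elementwise by the learned weight. A scalar sketch of the same
computation (the name is mine):

```rust
// y[i] = x[i] / sqrt(mean(x^2) + eps) * weight[i]
fn rms_norm(x: &[f32], weight: &[f32], eps: f32) -> Vec<f32> {
    let mean_sq = x.iter().map(|v| v * v).sum::<f32>() / x.len() as f32;
    let scale = 1.0 / (mean_sq + eps).sqrt();
    x.iter().zip(weight).map(|(v, w)| v * scale * w).collect()
}
```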
@@ -307,8 +307,8 @@ impl FeedForward {
);
let w1_out = w1_out.silu();
let w1w3_out = w1_out.hadamard_product(&w3_out).transpose();
let out = self.w2.matrix_mul_transposed(&w1w3_out);
return out;
self.w2.matrix_mul_transposed(&w1w3_out)
}
}
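The feed-forward above is the SwiGLU variant used by LLaMA:
`w2(silu(w1·x) ⊙ (w3·x))`, where ⊙ is the elementwise (Hadamard) product. The
gate nonlinearity, per element:

```rust
// silu(z) = z * sigmoid(z), the gate used in SwiGLU.
fn silu(z: f32) -> f32 {
    z / (1.0 + (-z).exp())
}
```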
@@ -417,8 +417,8 @@ impl Attention {
let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
let xv_row = Tensor::concat(&concat_vec2);
let mut cache_k = attention_cache.cache_k[idx as usize].write().unwrap();
let mut cache_v = attention_cache.cache_v[idx as usize].write().unwrap();
let mut cache_k = attention_cache.cache_k[idx].write().unwrap();
let mut cache_v = attention_cache.cache_v[idx].write().unwrap();
/*
let m = xq_row
@@ -442,21 +442,21 @@ impl Attention {
cache_v.set_f32(dim as i64, pos as i64, v);
}
}
let keys = cache_k.clip_cols((start_pos + seq_len as usize) as usize);
let values = cache_v.clip_cols((start_pos + seq_len as usize) as usize);
let keys = cache_k.clip_cols(start_pos + seq_len as usize);
let values = cache_v.clip_cols(start_pos + seq_len as usize);
let m = xq_row
.matrix_mul(&keys)
.scalar_multiply_f32(1.0 / (self.head_dim as f32).sqrt());
let m2 = match mask {
match mask {
Some(ref mask) => m
.add(mask)
.to_f32()
.softmax()
.matrix_mul_transposed(&values),
None => m.softmax().matrix_mul_transposed(&values),
};
m2
}
})
.collect();
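This is standard scaled dot-product attention, `softmax(QKᵀ/√d + mask)·V`,
computed per head against the growing key/value cache. The scaling for one
query/key pair, as a sketch (the name is mine):

```rust
// Scaled dot-product score: q·k / sqrt(head_dim), matching the
// 1.0 / (head_dim as f32).sqrt() factor above.
fn attn_score(q: &[f32], k: &[f32]) -> f32 {
    let dot: f32 = q.iter().zip(k).map(|(a, b)| a * b).sum();
    dot / (q.len() as f32).sqrt()
}
```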
@@ -466,18 +466,18 @@ impl Attention {
.into_par_iter()
.map(|idx| {
let mut concat_vec: Vec<Tensor> = vec![];
for idx2 in 0..self.n_local_heads {
concat_vec.push(output[idx2 as usize].row(idx as i64));
for output in &output {
concat_vec.push(output.row(idx));
}
let concat_vec2: Vec<&Tensor> = concat_vec.iter().collect();
let xq_row = Tensor::concat(&concat_vec2).view(1, 4096);
let xq_row = xq_row.matrix_mul_transposed(&self.wo);
xq_row
xq_row.matrix_mul_transposed(&self.wo)
})
.collect();
let output3: Vec<&Tensor> = output2.iter().collect();
let output2: Tensor = Tensor::concat(&output3);
return output2;
output2
}
}
@@ -513,7 +513,7 @@ fn apply_rotary_emb(
xk_out.set_f32(row, col * 2 + 1, xk_imagpart);
}
}
return (xq_out, xk_out);
(xq_out, xk_out)
}
fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis {
@@ -526,8 +526,8 @@ fn compute_freqs_cis(dim: usize, end: usize, theta: f64) -> FreqsCis {
let mut result: Vec<Vec<f64>> = Vec::new();
for x in 0..end {
let mut row = Vec::new();
for y in 0..freqs.len() {
let freq = freqs[y] * (x as f64);
for freq in freqs.iter() {
let freq = freq * (x as f64);
row.push(freq);
}
result.push(row);
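For reference, the table being built here follows the usual rotary-embedding
recipe: base frequencies `freqs[i] = theta^(-2i/dim)` for `i` in `0..dim/2`,
with each position `x` getting angles `x * freqs[i]`. A sketch of the base
frequencies under that assumption (the name is mine):

```rust
// Rotary embedding base frequencies: theta^(-2i/dim) for i in 0..dim/2.
fn rope_freqs(dim: usize, theta: f64) -> Vec<f64> {
    (0..dim / 2)
        .map(|i| theta.powf(-(2.0 * i as f64) / dim as f64))
        .collect()
}
```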

@@ -72,14 +72,14 @@ impl Value {
pub fn get_persistent_id(&self) -> Option<&Value> {
match self {
Value::PersistentId(v) => Some(&v),
Value::PersistentId(v) => Some(v),
_ => None,
}
}
pub fn get_tuple(&self) -> Option<&[Value]> {
match self {
Value::Tuple(v) => Some(&v),
Value::Tuple(v) => Some(v),
_ => None,
}
}
@@ -92,7 +92,7 @@ impl Value {
Value::Global(ref module_name, ref attribute_name) => {
if module_name == "torch._utils" && attribute_name == "_rebuild_tensor_v2" {
match **args {
Value::Tuple(ref args) => self.to_tensor_builder2(&args),
Value::Tuple(ref args) => self.to_tensor_builder2(args),
_ => None,
}
} else {
@@ -146,14 +146,14 @@ impl Value {
return None;
}
return Some(TensorBuilder {
Some(TensorBuilder {
src_path: PathBuf::from(storage_filename),
dtype,
stride: row_stride,
rows,
cols,
nitems,
});
})
}
fn to_tensor_builder2_6items(args: &[Value]) -> Option<TensorBuilder> {
@@ -203,14 +203,14 @@ impl Value {
return None;
}
return Some(TensorBuilder {
Some(TensorBuilder {
src_path: PathBuf::from(storage_filename),
dtype,
stride: row_stride,
rows,
cols,
nitems,
});
})
/* Args should look like this (took random example from debug print) :
0 PERSISTENT_ID
@@ -545,7 +545,7 @@ pub fn unpickle(bytes: &[u8]) -> Result<Value, UnpicklingError> {
"Stack is empty while handling LONG_BINPUT".to_string(),
));
}
memo.insert(key as u32, stack.last().unwrap().clone());
memo.insert(key, stack.last().unwrap().clone());
bytes = &bytes[5..];
continue;
}
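For context on the `&bytes[5..]` skip: LONG_BINPUT is the pickle opcode `r`
followed by a 4-byte little-endian memo index, so the opcode plus its argument
is 5 bytes. Decoding the key, as a sketch (the name is mine):

```rust
// LONG_BINPUT: 1 opcode byte ('r') + u32 little-endian memo key = 5 bytes.
fn long_binput_key(bytes: &[u8]) -> u32 {
    u32::from_le_bytes([bytes[1], bytes[2], bytes[3], bytes[4]])
}
```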
