Make the number of threads configurable, obtaining the default from the system rather than hardcoding it to 32.

Branch: master
Author: Mikko Juola, 3 years ago
Parent: 44e0abf0f1
Commit: ff349eeea0

@@ -39,6 +39,9 @@ struct Cli {
     #[arg(long)]
     repetition_penalty: Option<f32>,
+    #[arg(long)]
+    max_threads: Option<usize>,
     #[arg(long, action)]
     f16: bool,
@@ -63,6 +66,17 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
     let tokenizer_path = cli.tokenizer_path;
     let param_path = cli.param_path;
+    let max_threads: usize = match cli.max_threads {
+        None => rayon::current_num_threads(),
+        Some(max_threads) => {
+            rayon::ThreadPoolBuilder::new()
+                .num_threads(max_threads)
+                .build_global()
+                .unwrap();
+            max_threads
+        }
+    };
     let mut be_quiet: bool = false;
     if !colored::control::SHOULD_COLORIZE.should_colorize() {
         be_quiet = true;
@@ -218,6 +232,8 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
     pln!(" norm_eps: {}", params.norm_eps);
     pln!(" vocab_size: {}", params.vocab_size);
     pln!("---");
+    pln!(" maximum number of threads: {}", max_threads);
+    pln!("---");
     pln!("Max sequence length: {}", max_seq_len);
     pln!("Temperature: {}", token_sampler.get_temperature());
     pln!("Top P: {}", token_sampler.get_top_p());

@@ -1096,6 +1096,8 @@ impl Tensor {
     /// Matrix multiplication done in-place, but the second matrix is transposed.
     /// With this, you can avoid using .transpose() on the second matrix.
     pub fn matrix_mul_inplace_transposed(&mut self, src: &Tensor, other: &Tensor) {
+        let nthreads: usize = rayon::current_num_threads();
         #[cfg(feature = "opencl")]
         if self.is_on_gpu() && src.is_on_gpu() && other.is_on_gpu() {
             self.matrix_mul_inplace_transposed_gpu(src, other);
@@ -1165,7 +1167,8 @@ impl Tensor {
         let src_data_wrap: WrappedPtr = WrappedPtr::wrap(src.data);
         let other_data: WrappedPtr = WrappedPtr::wrap(other.data);
         let tgt_data: WrappedPtr = WrappedPtr::wrap(self.data);
-        (0..32).into_par_iter().for_each(|thread_idx| {
+        (0..nthreads).into_par_iter().for_each(|thread_idx| {
             let src_data: *const f32 = src_data_wrap.unwrap() as *const f32;
             let other_data: *const f32 = other_data.unwrap() as *const f32;
             let tgt_data: *mut f32 = tgt_data.unwrap() as *mut f32;
@@ -1176,7 +1179,7 @@ impl Tensor {
             let row3 = row * 4 + 3;
             for col in 0..self_cols_its {
                 let row_col = row * self_cols_its + col;
-                if row_col % 32 != thread_idx {
+                if row_col % nthreads != thread_idx {
                     continue;
                 }
                 let col0 = col * 4;
@@ -1386,7 +1389,7 @@ impl Tensor {
         let src_data_wrap: WrappedPtr = WrappedPtr::wrap(src.data);
         let other_data: WrappedPtr = WrappedPtr::wrap(other.data);
         let tgt_data: WrappedPtr = WrappedPtr::wrap(self.data);
-        (0..32).into_par_iter().for_each(|thread_idx| {
+        (0..nthreads).into_par_iter().for_each(|thread_idx| {
             let src_data: *const f16 = src_data_wrap.unwrap() as *const f16;
             let other_data: *const f16 = other_data.unwrap() as *const f16;
             let tgt_data: *mut f16 = tgt_data.unwrap() as *mut f16;
@@ -1397,7 +1400,7 @@ impl Tensor {
             let row3 = row * 4 + 3;
             for col in 0..self_cols_its {
                 let row_col = row * self_cols_its + col;
-                if row_col % 32 != thread_idx {
+                if row_col % nthreads != thread_idx {
                     continue;
                 }
                 let col0 = col * 4;
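
For context on the kernels above: rather than always splitting the output 32 ways, each parallel task now handles the stripe of (row, col) cells whose flattened index is congruent to its task index modulo the thread count. Below is a minimal standalone sketch of that striping scheme with illustrative names, using a reduction in place of the actual code's raw-pointer writes through WrappedPtr:

use rayon::prelude::*;

// One task per rayon thread; task `thread_idx` owns every cell whose
// flattened index satisfies row_col % nthreads == thread_idx, mirroring
// the skip check in the diff above.
fn striped_sum(matrix: &[f32], rows: usize, cols: usize) -> f32 {
    let nthreads = rayon::current_num_threads();
    (0..nthreads)
        .into_par_iter()
        .map(|thread_idx| {
            let mut partial = 0.0f32;
            for row in 0..rows {
                for col in 0..cols {
                    let row_col = row * cols + col;
                    // Cells owned by other tasks are skipped.
                    if row_col % nthreads != thread_idx {
                        continue;
                    }
                    partial += matrix[row_col];
                }
            }
            partial
        })
        .sum()
}

fn main() {
    // 6x4 matrix of ones: the striped sum must equal the cell count.
    let m = vec![1.0f32; 6 * 4];
    assert_eq!(striped_sum(&m, 6, 4), 24.0);
}

Striding by flattened index keeps the per-task workload balanced regardless of matrix shape, at the cost of each task touching non-contiguous cells.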
