Add repetition penalty, add colors to outputs based on probabilities, try to make softmax() more numerically stable.

3 years ago · 862d4a15d6
parent f4629ca987
commit 862d4a15d6
3 changed files with 108 additions and 13 deletions
--- a/README.md
+++ b/README.md
@ -56,6 +56,12 @@ cast to 32-bit floats.
 You can use `--temperature`, `--top-p` and `--top-k` to adjust token sampler
 settings.
 There is `--repetition-penalty` setting. 1.0 means no penalty. This value
 likely should be between 0 and 1. Values smaller than 1.0 give a penalty to
 tokens that appear in the context, by
 `x*(repetitition_penalty^num_occurrences)` before applying `softmax()` on the
 output probabilities. Or in other words, values smaller than 1.0 apply penalty.
 You can also use `--prompt-file` to read the prompt from a file instead from
 the command line.
--- a/src/rllama_main.rs
+++ b/src/rllama_main.rs
@ -36,6 +36,8 @@ struct Cli {
    top_p: Option<f32>,
    #[arg(long)]
    top_k: Option<i32>,
    #[arg(long)]
    repetition_penalty: Option<f32>,
    #[cfg(feature = "opencl")]
    #[arg(long)]
@ -185,7 +187,11 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut toks_id: Vec<TokenId> = tok.tokenize_to_ids(prompt.clone());
    let mut prev_pos = 0;
-    let mut token_sampler = TokenSampler::new().temperature(0.8).top_p(0.9).top_k(50);
+    let mut token_sampler = TokenSampler::new()
        .temperature(0.8)
        .top_p(0.9)
        .top_k(50)
        .repetition_penalty(0.8);
    if let Some(temperature) = cli.temperature {
        token_sampler = token_sampler.temperature(temperature);
@ -196,6 +202,9 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
    if let Some(top_k) = cli.top_k {
        token_sampler = token_sampler.top_k(top_k as usize);
    }
    if let Some(repetition_penalty) = cli.repetition_penalty {
        token_sampler = token_sampler.repetition_penalty(repetition_penalty);
    }
    pln!("---");
    pln!(" dim: {}", params.dim);
@ -209,6 +218,10 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
    pln!("Temperature: {}", token_sampler.get_temperature());
    pln!("Top P: {}", token_sampler.get_top_p());
    pln!("Top K: {}", token_sampler.get_top_k());
    pln!(
        "Repetition penalty: {}",
        token_sampler.get_repetition_penalty()
    );
    pln!("---");
    pln!(
        "{}",
@ -229,9 +242,9 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut stop_seen: bool = false;
    while toks_id.len() < max_seq_len {
        let now = std::time::Instant::now();
        let preds = tr.forward(&toks_id[prev_pos..], prev_pos, &mut caches);
-        let highest_pred_idx = token_sampler.sample(&preds);
+
        let (highest_pred_idx, token_prob) = token_sampler.sample(&preds, &tok, &toks_id);
        toks_id.push(highest_pred_idx as TokenId);
        for (tok_idx, tok_id) in toks_id[prev_pos + 1..].iter().enumerate() {
@ -252,7 +265,18 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
            if first && tok_idx < toks_id.len() - 2 {
                // intentionally left empty
            } else {
-                print!("{}", tok_str.truecolor(128, 255, 128));
+                let redness: f32 = token_prob * 255.0;
                let redness = if redness > 255.0 {
                    255
                } else if redness < 0.0 {
                    0
                } else {
                    redness as u8
                };
                print!(
                    "{}",
                    tok_str.truecolor(128 + redness / 2, 255 - redness / 2, 128)
                );
            }
        }
        if first {
--- a/src/token_sampler.rs
+++ b/src/token_sampler.rs
@ -1,11 +1,13 @@
 use crate::tensor::Tensor;
-use crate::tokenizer::TokenId;
+use crate::tokenizer::{TokenId, Tokenizer};
 use rand::Rng;
 use std::collections::BTreeMap;
 pub struct TokenSampler {
    temperature: f32,
    top_p: f32,
    top_k: usize,
    repetition_penalty: f32,
 }
 impl Default for TokenSampler {
@ -17,9 +19,11 @@ impl Default for TokenSampler {
 impl TokenSampler {
    pub fn new() -> Self {
        Self {
-            temperature: 0.8,
+            temperature: 0.2,
            top_p: 1.0,
            top_k: 1, // same as argmax
            repetition_penalty: 0.8, // 1.0 = no penalty. values above 1.0 make repetition
                      // encouraged which can quickly devolve into repeating loop
        }
    }
@ -35,6 +39,10 @@ impl TokenSampler {
        self.top_k
    }
    pub fn get_repetition_penalty(&self) -> f32 {
        self.repetition_penalty
    }
    pub fn temperature(self, temperature: f32) -> Self {
        Self {
            temperature,
@ -50,20 +58,77 @@ impl TokenSampler {
        Self { top_k, ..self }
    }
-    pub fn sample(&self, logits: &Tensor) -> TokenId {
+    pub fn repetition_penalty(self, repetition_penalty: f32) -> Self {
        Self {
            repetition_penalty,
            ..self
        }
    }
    pub fn sample(
        &self,
        logits: &Tensor,
        tokenizer: &Tokenizer,
        existing_tokens: &[TokenId],
    ) -> (TokenId, f32) {
        let mut times_used: BTreeMap<TokenId, usize> = BTreeMap::new();
        for token in existing_tokens {
            times_used
                .entry(*token)
                .and_modify(|e| *e += 1)
                .or_insert(1);
        }
        let nrows = logits.rows();
        assert!(logits.cols() == 1);
        let mut logits = logits.transpose();
        if self.temperature > 0.0 {
            logits = logits.scalar_multiply_f32(1.0 / self.temperature);
            logits = logits.softmax();
        }
        if self.repetition_penalty != 1.0 {
            for token_idx in 0..logits.rows() {
                if let Some(count) = times_used.get(&(token_idx as TokenId)) {
                    let penalty = self.repetition_penalty.powf(*count as f32);
                    logits.set_f32(0, token_idx, logits.get_f32(0, token_idx) * penalty);
                }
            }
        }
        let mut maxv: f32 = std::f32::NEG_INFINITY;
        for token_idx in 0..logits.rows() {
            let v = logits.get_f32(0, token_idx);
            if v > maxv {
                maxv = v;
            }
        }
        // To numerically stabilize, remove maxv from all logits
        // softmax(x + c) = softmax(x) where c is a constant, and we make use of htat
        for token_idx in 0..logits.rows() {
            logits.set_f32(0, token_idx, logits.get_f32(0, token_idx) - maxv);
        }
        logits = logits.softmax();
        let mut logitsf: Vec<(TokenId, f32)> = Vec::with_capacity(nrows as usize);
        for i in 0..nrows {
-            logitsf.push((i as TokenId, logits.get_f32(0, i)));
+            let score = logits.get_f32(0, i);
            logitsf.push((i as TokenId, score));
        }
-        logitsf.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+        logitsf.sort_unstable_by(|a, b| {
            match b.1.partial_cmp(&a.1) {
                Some(c) => c,
                None => {
                    // Sort NaNs to bottom
                    if b.1.is_nan() {
                        return std::cmp::Ordering::Less;
                    } else if a.1.is_nan() {
                        return std::cmp::Ordering::Greater;
                    } else {
                        return std::cmp::Ordering::Equal;
                    }
                }
            }
        });
        logitsf.truncate(self.top_k);
        let mut p_accum: f32 = 0.0;
        for (idx, v) in logitsf.iter().enumerate() {
@ -78,14 +143,14 @@ impl TokenSampler {
            total_p += v.1;
        }
        let mut rng = rand::thread_rng();
-        let p: f32 = rng.gen_range(0.0..total_p);
+        let p: f32 = rng.gen_range(0.0..=total_p);
        p_accum = 0.0;
        for v in logitsf.into_iter() {
            p_accum += v.1;
            if p_accum >= p {
-                return v.0;
+                return (v.0, v.1 / total_p);
            }
        }
-        0
+        (0, 0.0)
    }
 }