From 8d780208f20060e719f2c10b45b1e830e89e412d Mon Sep 17 00:00:00 2001
From: BlinkDL
Date: Tue, 5 Jul 2022 21:53:49 +0800
Subject: [PATCH] typo fix

---
 RWKV-v3/src/model_run.py | 2 +-
 RWKV-v3/train.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/RWKV-v3/src/model_run.py b/RWKV-v3/src/model_run.py
index 0957780..44ea131 100644
--- a/RWKV-v3/src/model_run.py
+++ b/RWKV-v3/src/model_run.py
@@ -115,7 +115,7 @@ class Block(nn.Module):
         if self.layer_id == 0:
             x = self.ln0(x)
         if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':
-            x = x + self.ffnPre(x)
+            x = x + self.ffnPre(self.ln1(x))
         else:
             x = x + self.att(self.ln1(x))
         x = x + self.ffn(self.ln2(x))
diff --git a/RWKV-v3/train.py b/RWKV-v3/train.py
index ce9eb2f..1a07cad 100644
--- a/RWKV-v3/train.py
+++ b/RWKV-v3/train.py
@@ -59,7 +59,7 @@ batch_size = 12
 # Let's say you will train a L6-D512 model.
 # 1) Set lr_init = lr_final = 8e-4. Let it run for some mini-epochs, until the improvement of loss become slow.
 # 2) Check epoch_save_frequency and make sure the partially-trained model is saved. Ctrl+C to stop the run.
-# 3) Set lr_init = 8e-4, lr_final = 1e-5, warmup_tokens = ctx_len * batch_size * 50, betas = (0.9, 0.999)
+# 3) Set lr_init = 8e-4, lr_final = 1e-5, warmup_tokens = ctx_len * batch_size * 50, betas = (0.9, 0.999).
 # 4) Search for "torch.load" here and modify it to load the partially-trained model. Continue the training.
 #
 # For L12-D768, set lr_init = 6e-4. For L24-D1024, set lr_init = 4e-4. For L24-D2048, set lr_init = 3e-4.
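
Note on the model_run.py hunk: the fix restores the pre-LayerNorm residual pattern. In 'RWKV-ffnPre' mode, layer 0 replaces the attention (time-mixing) block with ffnPre, and that block must see the ln1-normalized input exactly as self.att does on every other layer. Below is a minimal, runnable sketch of the corrected forward logic; att, ffn, and ffnPre are stubbed with nn.Identity so the sketch stands alone (the real RWKV mixing blocks live in model_run.py). It is illustrative, not the repo's code.

import torch.nn as nn

class Block(nn.Module):
    def __init__(self, layer_id, n_embd, model_type):
        super().__init__()
        self.layer_id = layer_id
        self.model_type = model_type
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        if layer_id == 0:
            self.ln0 = nn.LayerNorm(n_embd)  # extra norm on raw embeddings
        # stand-ins for the real RWKV time-mix / channel-mix blocks
        self.att = nn.Identity()
        self.ffn = nn.Identity()
        self.ffnPre = nn.Identity()

    def forward(self, x):
        if self.layer_id == 0:
            x = self.ln0(x)
        if self.layer_id == 0 and self.model_type == 'RWKV-ffnPre':
            # the fix: ffnPre gets the ln1-normalized input,
            # matching the pre-LN residual used by att below
            x = x + self.ffnPre(self.ln1(x))
        else:
            x = x + self.att(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x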
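
Note on the train.py hunk: it only touches a comment, but the recipe that comment documents is a two-stage schedule for an L6-D512 model. A sketch of the recommended values, using the variable names from train.py (the ctx_len value here is an assumed placeholder; train.py sets its own):

ctx_len = 1024        # assumption for illustration; see train.py
batch_size = 12       # value from train.py

# Stage 1: flat LR until loss improvement slows, then Ctrl+C
# (epoch_save_frequency must keep a partial checkpoint).
lr_init = lr_final = 8e-4

# Stage 2: reload the partial checkpoint via torch.load, then decay.
lr_init = 8e-4
lr_final = 1e-5
warmup_tokens = ctx_len * batch_size * 50   # = 614400 warmup tokens
betas = (0.9, 0.999)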