diff --git a/RWKV-v4/run.py b/RWKV-v4/run.py index 22fcc50..1c1c2fb 100644 --- a/RWKV-v4/run.py +++ b/RWKV-v4/run.py @@ -57,6 +57,11 @@ elif TOKEN_MODE == 'pile': # n_embd = 1024 # ctx_len = 1024 + # MODEL_NAME = 'RWKV-4-Pile-1B5-20220903-8040' + # n_layer = 24 + # n_embd = 2048 + # ctx_len = 1024 + os.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment) os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda' model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' diff --git a/RWKV-v4/src/model.py b/RWKV-v4/src/model.py index 7434ccb..e4faf89 100644 --- a/RWKV-v4/src/model.py +++ b/RWKV-v4/src/model.py @@ -278,7 +278,7 @@ class Block(nn.Module): self.ln0 = nn.LayerNorm(config.n_embd) if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre': - self.ffnPre = RWKV_ChannelMix(config, layer_id+1000) + self.ffnPre = RWKV_ChannelMix(config, 0) else: self.att = RWKV_TimeMix(config, layer_id)