rescale to avoid FP16 overflow

3 years ago · 2567c8c904
parent aef9f6f7ef
commit 2567c8c904
2 changed files with 15 additions and 1 deletions
--- a/RWKV-v4neo/run.py
+++ b/RWKV-v4neo/run.py
@@ -124,7 +124,8 @@ from src.model_run import RWKV_RNN
 model = RWKV_RNN(args)
 print(f'\nOptimizing speed...')
-model.forward([187], None)
+out, _ = model.forward([187], None)
 # print(out)
 gc.collect()
 torch.cuda.empty_cache()
--- a/RWKV-v4neo/src/model_run.py
+++ b/RWKV-v4neo/src/model_run.py
@@ -22,6 +22,8 @@ print(f'\nRWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\n')
 DEBUG_TIME = False   # True False - show trained time-coeffs
 RWKV_RESCALE_LAYER = 6 # set x=x/2 every X layer
 ############################################################################################################
 class RWKV_RNN(nn.Module):
@@ -41,6 +43,14 @@ class RWKV_RNN(nn.Module):
            keys = list(w.keys())
            print_need_newline = False
            for x in keys:
                block_id = 0
                if 'blocks.' in x:
                    block_id = int(x.split('.')[1])
                if 'att.output.weight' in x:
                    w[x] = w[x] / (2 ** int(block_id // RWKV_RESCALE_LAYER))
                if 'ffn.value.weight' in x:
                    w[x] = w[x] / (2 ** int(block_id // RWKV_RESCALE_LAYER))
                if '.time_' in x:
                    w[x] = w[x].squeeze()
                    if DEBUG_TIME:
@@ -209,6 +219,9 @@ class RWKV_RNN(nn.Module):
                    ww.time_mix_k, ww.time_mix_r, 
                    ww.key.weight, ww.value.weight, ww.receptance.weight)
                if (i+1) % RWKV_RESCALE_LAYER == 0:
                    x = x / 2
            if preprocess_only:
                return state