more training tips

commit 6667ad18c2 (parent 5f6e9356a2) on branch main
BlinkDL · 3 years ago

@@ -68,8 +68,8 @@ def RUN_CUDA(B, T, C, w, u, k, v):
 ########################################################################################################
 def RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module
-    print('\n[--> first run, init model params (very slow for large models) <--]\n')
-    print('\n[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
+    print('\n[--> first run, init model params (very slow for large models) <--]')
+    print('[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
     for m in module.modules():
         if not isinstance(m, (nn.Linear, nn.Embedding)):
             continue

@@ -7,7 +7,12 @@ import os
 os.environ['USE_WANDB'] = '0' # 0 = False, 1 = True
 ### This is using DeepSpeed stage2 + FP16 ##############################################################
+#
+# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
+# 1) leave RWKV_NUM_GPUS = '1' and let it run for 1 'mini-epoch' and it will save a 'trained-1.pth'
+# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = NUM_GPUS * single_gpu_batchsz,
+#    EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training
+#
 os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
 NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
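Spelled out, step 2 of the suggested procedure amounts to flipping a few settings before relaunching. A minimal sketch assuming the configuration names used in this script (single_gpu_batchsz is a hypothetical placeholder for whatever per-GPU batch size step 1 used):

import os

os.environ['RWKV_NUM_GPUS'] = '8'           # or your actual GPU count
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])

single_gpu_batchsz = 12                     # hypothetical per-GPU batch size
batch_size = NUM_GPUS * single_gpu_batchsz  # scale the global batch with the GPU count

EPOCH_BEGIN = 1     # resume counting after the saved mini-epoch
LOAD_MODEL = True   # load 'trained-1.pth' instead of running the slow init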
