diff --git a/RWKV-v4/src/model.py b/RWKV-v4/src/model.py
index 6151b65..40c279c 100644
--- a/RWKV-v4/src/model.py
+++ b/RWKV-v4/src/model.py
@@ -68,8 +68,8 @@ def RUN_CUDA(B, T, C, w, u, k, v):
 ########################################################################################################
 
 def RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module
-    print('\n[--> first run, init model params (very slow for large models) <--]\n')
-    print('\n[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
+    print('\n[--> first run, init model params (very slow for large models) <--]')
+    print('[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
     for m in module.modules():
         if not isinstance(m, (nn.Linear, nn.Embedding)):
             continue
diff --git a/RWKV-v4/train.py b/RWKV-v4/train.py
index 865a715..4c670d3 100644
--- a/RWKV-v4/train.py
+++ b/RWKV-v4/train.py
@@ -7,7 +7,12 @@ import os
 os.environ['USE_WANDB'] = '0' # 0 = False, 1 = True
 
 ### This is using DeepSpeed stage2 + FP16 ##############################################################
-
+#
+# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
+# 1) leave RWKV_NUM_GPUS = '1' and let it run for 1 'mini-epoch' and it will save a 'trained-1.pth'
+# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = NUM_GPUS * single_gpu_batchsz,
+#    EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training
+#
 os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
 NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
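For reference, step 2 of the procedure in the comment above amounts to roughly the following settings in train.py. This is a sketch only: RWKV_NUM_GPUS, EPOCH_BEGIN, LOAD_MODEL and batch_size are the knobs named in the comment, while 8 GPUs and a per-GPU batch size of 12 are assumed example values.

import os

os.environ['RWKV_NUM_GPUS'] = '8'           # step 2: switch from 1 GPU to all available GPUs (assumed: 8)
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])

single_gpu_batchsz = 12                     # assumed: whatever per-GPU batch size was used in step 1
batch_size = NUM_GPUS * single_gpu_batchsz  # total batch size scales with the GPU count

EPOCH_BEGIN = 1                             # resume numbering after the single-GPU 'mini-epoch'
LOAD_MODEL = True                           # load 'trained-1.pth' saved by step 1 and continue training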