more training tips

commit 6667ad18c2 (parent 5f6e9356a2) on branch main
BlinkDL · 3 years ago

@@ -68,8 +68,8 @@ def RUN_CUDA(B, T, C, w, u, k, v):
 ########################################################################################################
 def RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module
-    print('\n[--> first run, init model params (very slow for large models) <--]\n')
-    print('\n[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
+    print('\n[--> first run, init model params (very slow for large models) <--]')
+    print('[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
     for m in module.modules():
         if not isinstance(m, (nn.Linear, nn.Embedding)):
             continue

@@ -7,7 +7,12 @@ import os
 os.environ['USE_WANDB'] = '0' # 0 = False, 1 = True
 ### This is using DeepSpeed stage2 + FP16 ##############################################################
+#
+# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
+# 1) leave RWKV_NUM_GPUS = '1' and let it run for 1 'mini-epoch' and it will save a 'trained-1.pth'
+# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = NUM_GPUS * single_gpu_batchsz,
+#    EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training
+#
 os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
 NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
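Spelled out, step 2 of the suggested procedure amounts to flipping a few settings before relaunching. A minimal sketch assuming the configuration names used in this script (single_gpu_batchsz is a hypothetical placeholder for whatever per-GPU batch size step 1 used):

import os

os.environ['RWKV_NUM_GPUS'] = '8'           # or your actual GPU count
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])

single_gpu_batchsz = 12                     # hypothetical per-GPU batch size
batch_size = NUM_GPUS * single_gpu_batchsz  # scale the global batch with the GPU count

EPOCH_BEGIN = 1     # resume counting after the saved mini-epoch
LOAD_MODEL = True   # load 'trained-1.pth' instead of running the slow init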
