# context = 'A'
# context = "\nIn the"
# context = '\nSugar:'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'
NUM_TRIALS = 999
LENGTH_PER_TRIAL = 333
TEMPERATURE = 1.0
top_p = 0.7
top_p_newline = 0.9  # only used in TOKEN_MODE = char
DEBUG_DEBUG = False  # True False --> show softmax output
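
# For orientation, a minimal sketch of how TEMPERATURE and top_p are typically applied to
# the output logits before picking the next token. This is an illustration under assumed
# shapes, not the sampling code used by this repo.
import numpy as np

def sample_logits(logits, temperature=1.0, top_p=0.7):
    logits = np.asarray(logits, dtype=np.float64)
    probs = np.exp((logits - np.max(logits)) / max(temperature, 1e-8))
    probs /= probs.sum()                                   # softmax with temperature
    sorted_probs = np.sort(probs)[::-1]                    # descending
    cutoff = sorted_probs[np.argmax(np.cumsum(sorted_probs) >= top_p)]
    probs[probs < cutoff] = 0.0                            # drop the tail outside the nucleus (top-p)
    probs /= probs.sum()                                   # renormalize and sample
    return int(np.random.choice(len(probs), p=probs))
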
print('\nYour prompt has ' + str(src_len) + ' tokens.')
print('\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\n')
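
# The message above refers to RWKV's two execution modes: RNN mode consumes the prompt one
# token at a time, while GPT (parallel) mode can build the same hidden state in one batched
# pass over the whole prompt. A rough sketch of the difference; 'rnn_step' and 'gpt_forward'
# are hypothetical stand-ins, not functions from this repo.
def build_state_rnn(rnn_step, prompt_tokens, state=None):
    for tok in prompt_tokens:                    # O(len(prompt)) sequential steps; slow for long prompts
        logits, state = rnn_step(tok, state)
    return logits, state

def build_state_gpt(gpt_forward, prompt_tokens):
    logits, state = gpt_forward(prompt_tokens)   # one parallel forward pass over the whole prompt
    return logits, state
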
os.environ['RWKV_FLOAT_MODE'] = 'bf16'  # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long; can be solved in the future) or 'fp32'
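
# Illustrative only: roughly how these mode strings correspond to torch dtypes (presumably
# handled inside the training code; this mapping is an assumption for orientation).
import torch
FLOAT_MODE_TO_DTYPE = {'fp32': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}
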
### This is using DeepSpeed stage2 + FP16 ##############################################################
#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for
# multi-GPU training (see the sketch below):
# 1) leave RWKV_NUM_GPUS = '1' and let it run for one 'mini-epoch'; it will save 'trained-1.pth'
# 2) set RWKV_NUM_GPUS = '8' (or your number of GPUs), batch_size = NUM_GPUS * single_gpu_batchsz,
#    EPOCH_BEGIN = 1, LOAD_MODEL = True; it will load 'trained-1.pth' and continue training from it
#
os.environ['RWKV_NUM_GPUS'] = '1'  # number of GPUs to use
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
### Change these if you want to continue training from a saved model ###################################
EPOCH_BEGIN = 0
LOAD_MODEL = False  # shall we continue from the #EPOCH_BEGIN model?
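
# A sketch of the two-phase multi-GPU recipe described above; 'single_gpu_batchsz' and the
# returned field names are illustrative assumptions, not settings read by this script.
def multi_gpu_phase(phase, num_gpus=8, single_gpu_batchsz=12):
    if phase == 1:   # warm-up on one GPU: initialize the model and save 'trained-1.pth'
        return dict(RWKV_NUM_GPUS='1', batch_size=single_gpu_batchsz,
                    EPOCH_BEGIN=0, LOAD_MODEL=False)
    # scale-out: resume from the phase-1 checkpoint on all GPUs
    return dict(RWKV_NUM_GPUS=str(num_gpus), batch_size=num_gpus * single_gpu_batchsz,
                EPOCH_BEGIN=1, LOAD_MODEL=True)
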
import logging, types
from src.utils import Dataset
import torch
import numpy as np
from src.binidx import MMapIndexedDataset  # for the Megatron-LM 'binidx' format
# if False: # True False ---> Set to False if you don't understand it
#     print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
#     import src.utils
#     src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)

### Step 1: set training data ##########################################################################
if EXPRESS_PILE_MODE:
    datafile = 'train.npy'  # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy
    datafile_encoding = 'numpy'
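
# For orientation (illustrative, not part of the training script): a .npy datafile produced
# by that script should just be an array of token ids, so it can be sanity-checked directly:
# tokens = np.load(datafile)
# print(tokens.shape, tokens[:10])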

### Step 2: set model size #############################################################################
#
# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch
# set VOCAB_SIZE = 50277 for fine-tuning pile models
# set VOCAB_SIZE = your_vocab_size for 'binidx' data
#
os.environ['VOCAB_SIZE'] = '0'
if EXPRESS_PILE_MODE:
    os.environ['VOCAB_SIZE'] = '50277'
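
# A sketch of what VOCAB_SIZE = 0 (auto-compute) amounts to for a char-level corpus; this is
# an illustration, not the repo's Dataset code, and the file path below is hypothetical.
def auto_vocab_size(path, encoding='utf-8'):
    with open(path, 'r', encoding=encoding) as f:
        text = f.read()
    return len(set(text))   # one token per distinct character

# print(auto_vocab_size('my-corpus.txt'))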