@@ -25,11 +25,24 @@ model_type = 'RWKV'
 datafile = u"V:\\NLP\\simplebooks\\simplebooks-92-raw\\train.txt"
 datafile_encoding = 'utf-8'
 # datafile = u"D:\\NLP-Data\\ww100M.txt"
+# datafile = u"D:\\NLP-Data\\__2019.txt"
 # datafile = u"Y:\\BlinkNLP\\_txt_\\txt\\_all.txt"
+# datafile = u"V:\\NLP\\enwik8-shift-300.bpe"
 # datafile_encoding = 'utf-16'
+# datafile = u"V:\\NLP\\simplebooks-shift-utf32.word"
+# datafile_encoding = 'utf-32'
 datafile_type = 0 # use 0 for char-level english. use 1 for chinese. only affects some RWKV hyperparameters
 
+#################################### VERY IMPORTANT ####################################
+epoch_save_frequency = 10 # 0 = never, 1 = every 'epoch', 2 = every two 'epoch', etc.
+epoch_save_path = 'trained-'
+batch_size = 48 # if you see "CUDA out of memory", reduce this.
+                # if you have good GPU, increase this.
+                # use GPU-Z to find the highest value for your VRAM.
+########################################################################################
+
 model_level = 'character' # 'character' (recommended) or 'word'
 ctx_len = 256 # context length
@@ -39,11 +52,9 @@ n_embd = n_head * 64
 n_attn = n_embd
 n_ffn = n_embd
 
-batch_size = 64
 n_epoch = 50 # the 'epoch' here is actually very short (and of fixed length)
 lr_init = 8e-4 if model_type == 'RWKV' else 4e-4 # RWKV can use higher lr
-lr_final = 2e-4
+lr_final = 1e-5
 betas = (0.9, 0.999) if model_type == 'RWKV' else (0.9, 0.99)
 eps = 1e-8
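
Note on the schedule change above: lr_final drops from 2e-4 to 1e-5, and with lr_decay=True the Trainer anneals the learning rate from lr_init to lr_final over final_tokens = n_epoch * len(train_dataset) * ctx_len (set in the last hunk). With n_epoch = 50, epoch_length_fixed = 10000 and ctx_len = 256 that horizon is about 128M tokens. The Trainer class itself is not part of this diff, so the decay shape below (log-linear interpolation) is only an illustrative assumption:

```python
# Illustrative sketch only: the real Trainer's decay curve is not shown in this
# diff; here lr is interpolated log-linearly from lr_init down to lr_final.
import math

lr_init, lr_final = 8e-4, 1e-5
n_epoch, epoch_length_fixed, ctx_len = 50, 10000, 256
final_tokens = n_epoch * epoch_length_fixed * ctx_len   # = 128,000,000 tokens

def lr_at(tokens_seen):
    progress = min(tokens_seen / final_tokens, 1.0)
    return math.exp(math.log(lr_init) + progress * (math.log(lr_final) - math.log(lr_init)))

print(lr_at(0), lr_at(final_tokens // 2), lr_at(final_tokens))  # 8e-4 -> ~9e-5 -> 1e-5
```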
@@ -55,6 +66,7 @@ epoch_length_fixed = 10000 # make an 'epoch' very short
 rwkv_emb_scale = 0.4 # scale of initial embedding. 0.4 is a good choice
 rwkv_tiny_attn = 64 if (datafile_type == 0 and ctx_len > 600) else 0 # extra tiny attention dim, useful for long ctx char-level english
 rwkv_tiny_head = 1 # 1 is good enough. 8 is slow
+# n_side_proj = 512 # extra 'side projection', quite useful for BPE models
 
 ########################################################################################################
 # Load data
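
For context, the settings above are consumed right after this hunk when the training text is read and wrapped in the script's Dataset class. The exact constructor argument list is not visible in this diff, so the call below is only a hedged reconstruction of the "# Load data" step:

```python
# Hedged reconstruction of the load step (argument order is an assumption):
train_dataset = Dataset(open(datafile, "r", encoding=datafile_encoding).read(),
                        model_level, ctx_len, epoch_length_fixed)
```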
@@ -76,6 +88,15 @@ class Dataset(Dataset):
         # for u in unique:
         # print(u, end=' ')
         # print('\n\n')
+        xx = 0
+        xxObj = {}
+        for u in unique:
+            xxObj[xx] = u
+            xx += 1
+        with open('vocab.json', "w", encoding="utf-16") as vocab_file:
+            vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
         data_size, vocab_size = len(data), len(unique)
         print('data has %d %ss, %d unique.' % (data_size, model_level, vocab_size))
         self.stoi = {ch: i for i, ch in enumerate(unique)}
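
The new lines in this hunk dump the index-to-token table to vocab.json (UTF-16, with ensure_ascii=False so non-ASCII characters are written verbatim), which lets a separate script rebuild the same vocabulary without re-scanning the training text. A loader is not included in this diff; a minimal sketch of reading it back might look like:

```python
# Hypothetical companion loader for the vocab.json written above.
import json

with open('vocab.json', "r", encoding="utf-16") as vocab_file:
    obj = json.load(vocab_file)                 # JSON keys come back as strings
itos = {int(i): tok for i, tok in obj.items()}  # index -> token
stoi = {tok: i for i, tok in itos.items()}      # token -> index

ids = [stoi[ch] for ch in "some text"]          # encode (chars must be in vocab)
text = ''.join(itos[i] for i in ids)            # decode
```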
@@ -108,7 +129,7 @@ model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_typ
 print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas', betas, 'eps', eps, 'wd', weight_decay, 'ctx', ctx_len, 'layer', n_layer, 'head', n_head, 'embd', n_embd, 'attn', n_attn, 'ffn', n_ffn)
 
 tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size, weight_decay=weight_decay,
                       learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps,
-                      warmup_tokens=0, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=0)
+                      warmup_tokens=0, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=0, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)
 trainer = Trainer(model, train_dataset, None, tconf)
 trainer.train()
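
The last hunk threads epoch_save_frequency and epoch_save_path into TrainerConfig so the Trainer can periodically write weights during training. The Trainer-side changes are not in this section of the diff; a sketch of the kind of end-of-'epoch' hook these two settings enable (attribute names and the .pth suffix are assumptions) could be:

```python
import torch

# Hypothetical end-of-'epoch' hook inside the Trainer (not shown in this diff):
# 0 = never save, 1 = save every 'epoch', 2 = every second 'epoch', etc.
def maybe_save(model, config, epoch_idx):       # epoch_idx counted from 0
    freq = config.epoch_save_frequency
    if freq > 0 and (epoch_idx + 1) % freq == 0:
        torch.save(model.state_dict(),
                   config.epoch_save_path + str(epoch_idx + 1) + '.pth')
```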