supports RWKV-4 pile models

main
BlinkDL 3 years ago
parent 61b7c429df
commit 68c486ad10

(File diff suppressed because it is too large.)

@@ -1,37 +0,0 @@
{
"zero_allow_untested_optimizer":true,
"zero_optimization":{
"stage":2,
"contiguous_gradients":true,
"overlap_comm":true,
"allgather_partitions":true,
"reduce_scatter":true,
"allgather_bucket_size":200000000,
"reduce_bucket_size":200000000,
"sub_group_size":1000000000000
},
"activation_checkpointing":{
"partition_activations":false,
"cpu_checkpointing":false,
"contiguous_memory_optimization":false,
"synchronize_checkpoint_boundary":false
},
"aio":{
"block_size":1048576,
"queue_depth":8,
"single_submit":false,
"overlap_events":true,
"thread_count":1
},
"gradient_clipping": 1.0,
"gradient_accumulation_steps": 1,
"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"initial_scale_power": 12,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
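A config like the one above is just a plain DeepSpeed dict; further down in this diff, train.py builds an equivalent dict named DEEPSPEED_CFG and hands it to Lightning's DeepSpeedStrategy. A minimal sketch of that pattern, assuming the JSON is saved as ds_config.json (hypothetical filename) and a pytorch_lightning >= 1.6 import layout:

import json
from pytorch_lightning.strategies import DeepSpeedStrategy  # import path is an assumption for PL >= 1.6

with open('ds_config.json') as f:      # hypothetical path to a config like the one above
    DEEPSPEED_CFG = json.load(f)

# same pattern as the Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), ...) calls in train.py below
strategy = DeepSpeedStrategy(config=DEEPSPEED_CFG)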

@@ -15,70 +15,94 @@ torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
np.set_printoptions(precision=4, suppress=True, linewidth=200)
### Step 1: set model ##################################################################################
########################################################################################################
# Step 1: set model
#
# Set TOKEN_MODE to 'char' or 'bpe' if the model was trained from scratch by 'train.py'.
#
# Set TOKEN_MODE to 'pile' if you want to test pre-trained pile models.
########################################################################################################
os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' or 'fp16'
os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'
RUN_DEVICE = os.environ['RWKV_RUN_DEVICE']
TOKEN_MODE = 'char' # char / bpe / pile
ctx_len = 1024
n_layer = 6
n_embd = 512
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'
### Step 2: set vocab & context ########################################################################
CHAR_MODE = True # True False
ctx_len = 1024
if CHAR_MODE:
### example 1: char-level model
if TOKEN_MODE == 'char':
MODEL_NAME = 'trained-500' # your trained model
WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <--
# --> all unknown tokens in your context will be denoted by it <--
UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity
context = "\nIn the" # your prompt
else:
### example 2: BPE-level model
MODEL_NAME = 'trained-7773'
WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge]
# set UNKNOWN_CHAR to the rarest token in your vocab.json, and all unknown tokens in your prompt will be denoted by it
UNKNOWN_CHAR = ' ' # here we just set it to ' ' for simplicity
elif TOKEN_MODE == 'bpe':
MODEL_NAME = 'trained-500' # your trained model
WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge] for your BPE model
UNKNOWN_CHAR = None
context = 'A'
### Step 3: other config ###############################################################################
elif TOKEN_MODE == 'pile':
WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
UNKNOWN_CHAR = None
#---> you can set MODEL_NAME to your fine-tuned model <---
MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
# MODEL_NAME = 'trained-11'
n_layer = 12
n_embd = 768
ctx_len = 1024
# MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
# n_layer = 24
# n_embd = 1024
# ctx_len = 1024
os.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment)
os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'
########################################################################################################
# Step 2: set prompt & sampling stuffs
########################################################################################################
DEBUG_DEBUG = False # True False - show softmax output
# context = 'A'
# context = "\nIn the"
# context = '\nSugar:'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'
NUM_TRIALS = 999
LENGTH_PER_TRIAL = 500
LENGTH_PER_TRIAL = 333
TEMPERATURE = 1.0
top_p = 0.7
top_p_newline = 0.9
top_p_newline = 0.9 # only used in TOKEN_MODE = char
DEBUG_DEBUG = False # True False --> show softmax output
########################################################################################################
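# For orientation only: a rough, self-contained sketch of temperature + top-p (nucleus)
# sampling as controlled by TEMPERATURE and top_p above. This is NOT the repo's
# sample_logits (see src/utils.py); the function name is made up for illustration.
def sample_top_p_sketch(logits, temperature=1.0, top_p=0.7):
    import numpy as np  # np is also imported at the top of this script
    probs = np.exp((logits - np.max(logits)) / temperature)            # numerically stable softmax
    probs = probs / probs.sum()
    sorted_probs = np.sort(probs)[::-1]
    cutoff = sorted_probs[np.argmax(np.cumsum(sorted_probs) >= top_p)]  # smallest prob kept in the nucleus
    probs[probs < cutoff] = 0                                           # drop the tail
    return np.random.choice(len(probs), p=probs / probs.sum())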
print(f'Loading {MODEL_NAME}...')
from src.model_run import RWKV_RNN
model = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model = RWKV_RNN(MODEL_NAME, os.environ['RWKV_RUN_DEVICE'], model_type, n_layer, n_embd, ctx_len)
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
########################################################################################################
if tokenizer.charMode:
context = tokenizer.refine_context(context)
print('\nYour prompt has ' + str(len(context)) + ' tokens.')
ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
else:
ctx = tokenizer.tokenizer.encode(context)
src_len = len(ctx)
src_ctx = ctx.copy()
print('\nYour prompt has ' + str(src_len) + ' tokens.')
print('\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\n')
for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
t_begin = time.time_ns()
src_len = len(context)
if tokenizer.charMode:
ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
else:
ctx = tokenizer.tokenizer.encode(context)
print(('-' * 30) + context, end='')
ctx = src_ctx.copy()
model.clear()
if TRIAL == 0:
init_state = types.SimpleNamespace()
@@ -104,6 +128,9 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
print('model', np.array(x), '==>', np.array(
out), np.max(out), np.min(out))
if TOKEN_MODE == 'pile':
out[0] = -999999999 # disable <|endoftext|>
char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,
top_p_usual=top_p, top_p_newline=top_p_newline)
char = char.item()
@@ -112,5 +139,6 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
else:
print(tokenizer.tokenizer.decode(int(char)), end='', flush=True)
ctx += [char]
t_end = time.time_ns()
print("\n----------", round((t_end - t_begin) / (10 ** 9), 2), end='s ')

@@ -26,7 +26,6 @@ from torch.utils.cpp_extension import load
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])
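# (What the kernel computes, roughly: for each batch element and channel, the WKV mixing
#  y_t = (sum_{i<t} exp(w*(t-1-i) + k_i) * v_i + exp(u + k_t) * v_t)
#        / (sum_{i<t} exp(w*(t-1-i) + k_i) + exp(u + k_t)),
#  where w is negative (w = -exp(w_param) below), so contributions from older tokens decay exponentially.)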
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
@@ -35,48 +34,25 @@ if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
else:
w = -torch.exp(w.contiguous())
u = u.contiguous()
k = k.contiguous()
v = v.contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return y.half()
@staticmethod
def backward(ctx, gy):
B = ctx.B
T = ctx.T
C = ctx.C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w, u, k, v = ctx.saved_tensors
gw = torch.zeros((B, C), device='cuda')
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
ctx.B = B
ctx.T = T
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
return y.bfloat16()
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return y
@staticmethod
def backward(ctx, gy):
@@ -90,10 +66,18 @@ elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
else:
wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return (None, None, None, gw, gu, gk, gv)
def RUN_CUDA(B, T, C, w, u, k, v):
return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())
@@ -376,6 +360,8 @@ class GPT(nn.Module):
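# (Context for the lines below: with RWKV_HEAD_QK_DIM > 0, c holds causal q@k "copy" scores
#  of shape (B, T, T); multiplying by the one-hot of the input ids turns them into logits over
#  the vocab that favor copying earlier tokens, and they are added to the normal head output.
#  The one-hot dtype has to match RWKV_FLOAT_MODE, which is what this hunk adds for fp32.)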
c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).half()
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).bfloat16()
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
c = c @ F.one_hot(idx, num_classes=self.config.vocab_size)
x = self.head(x) + c
else:

@@ -26,7 +26,6 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
@@ -35,48 +34,25 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
else:
w = -torch.exp(w.contiguous())
u = u.contiguous()
k = k.contiguous()
v = v.contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return y.half()
@staticmethod
def backward(ctx, gy):
B = ctx.B
T = ctx.T
C = ctx.C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w, u, k, v = ctx.saved_tensors
gw = torch.zeros((B, C), device='cuda')
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
ctx.B = B
ctx.T = T
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
return y.bfloat16()
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return y
@staticmethod
def backward(ctx, gy):
@@ -90,10 +66,18 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
else:
wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return (None, None, None, gw, gu, gk, gv)
def RUN_CUDA(B, T, C, w, u, k, v):
return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())

@@ -19,6 +19,21 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
class L2Wrap(torch.autograd.Function):
@staticmethod
def forward(ctx, loss, y):
ctx.save_for_backward(y)
return loss
@staticmethod
def backward(ctx, grad_output):
y = ctx.saved_tensors[0]
# to encourage the logits to be close to 0
factor = 1e-4 / (y.shape[0] * y.shape[1])
maxx, ids = torch.max(y, -1, keepdim=True)
gy = torch.zeros_like(y)
gy.scatter_(-1, ids, maxx * factor)
return (grad_output, gy)
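# (Effect: backward leaves the real loss gradient untouched and adds a tiny extra gradient,
#  factor * max(y), at the position of the largest logit of each token, which gently pulls
#  the top logits toward 0 without changing the reported cross-entropy loss.)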
class TrainerConfig:
batch_size = 64
learning_rate = 4e-4
@@ -109,14 +124,15 @@ class Trainer(LightningLite):
for it, (x, y) in pbar:
with torch.set_grad_enabled(is_train):
_, loss = model(x, y) # forward the model
yyy, loss = model(x, y) # forward the model
lossL2 = L2Wrap.apply(loss, yyy)
all_loss = [loss.clone() for _ in range(NUM_GPUS)]
torch.distributed.all_gather(all_loss, loss)
if is_train: # backprop and update the parameters
model.zero_grad()
self.backward(loss)
self.backward(lossL2)
# deepspeed will handle gradient_clipping

@@ -22,13 +22,19 @@ class Dataset(Dataset):
self.data = data
if 'MMapIndexedDataset' in str(type(self.data)):
self.vocab_size = 253 # your vocab_size
self.vocab_size = int(os.environ['VOCAB_SIZE'])
print('current vocab size =', self.vocab_size, "(make sure it's correct)")
self.data_size = len(self.data._bin_buffer) // 2
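# (the "// 2" above assumes 2-byte uint16 tokens, which is what the pile-style binidx data uses)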
self.item_cnt = len(self.data)
print(f'data has {self.data_size} tokens.')
elif 'numpy' in str(type(self.data)):
self.vocab_size = int(os.environ['VOCAB_SIZE'])
print('current vocab size =', self.vocab_size, "(make sure it's correct)")
self.data_size = len(self.data)
print(f'data has {self.data_size} tokens.')
else:
print('building token list...', end=' ')
unique = sorted(list(set(data)))
self.vocab_size = len(unique)
# print()
# for u in unique:
# print(u, end=' ')
@@ -41,25 +47,25 @@ class Dataset(Dataset):
xx += 1
with open('vocab.json', "w", encoding="utf-16") as vocab_file:
vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
data_size, vocab_size = len(data), len(unique)
print('data has %d tokens, %d unique.' % (data_size, vocab_size))
self.data_size = len(self.data)
print('data has %d tokens, %d unique.' % (self.data_size, self.vocab_size))
self.stoi = {ch: i for i, ch in enumerate(unique)}
self.itos = {i: ch for i, ch in enumerate(unique)}
self.vocab_size = vocab_size
def __len__(self):
return self.epoch_length_fixed // NUM_GPUS
def __getitem__(self, idx):
# cheat: pick a random spot in dataset
if 'MMapIndexedDataset' in str(type(self.data)):
#
# we are cheating: pick a random spot in dataset
#
i = np.random.randint(0, self.data_size - (self.ctx_len + 1))
if 'MMapIndexedDataset' in str(type(self.data)):
dix = self.data.get(idx=0, offset=i, length=self.ctx_len + 1).astype(int)
elif 'numpy' in str(type(self.data)):
dix = self.data[i:i+self.ctx_len+1]
else:
i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))
chunk = self.data[i:i+self.ctx_len+1]
dix = [self.stoi[s] for s in chunk]
dix = [self.stoi[s] for s in self.data[i:i+self.ctx_len+1]]
x = torch.tensor(dix[:-1], dtype=torch.long)
y = torch.tensor(dix[1:], dtype=torch.long)
@@ -70,6 +76,10 @@ class TOKENIZER():
def __init__(self, WORD_NAME, UNKNOWN_CHAR='\ue083'):
if 'list' in str(type(WORD_NAME)):
self.charMode = False
if WORD_NAME[0] == WORD_NAME[1]:
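# same file given twice => a single HuggingFace tokenizer.json (e.g. the 20B_tokenizer.json used for pile models)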
from transformers import PreTrainedTokenizerFast
self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])
else:
from transformers import GPT2TokenizerFast
self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])
else:
@@ -85,7 +95,6 @@ class TOKENIZER():
self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]
def refine_context(self, context):
if self.charMode:
context = context.strip().split('\n')
for c in range(len(context)):
context[c] = context[c].strip().strip('\u3000').strip('\r')
@@ -93,7 +102,6 @@ class TOKENIZER():
context = '\n' + ('\n'.join(context)).strip()
if context == '':
context = '\n'
return context
def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):

@@ -3,39 +3,11 @@
########################################################################################################
import os
os.environ['USE_WANDB'] = '0' # 0 = False, 1 = True
os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future)
### This is using DeepSpeed stage2 + FP16 ##############################################################
#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
# 1) leave RWKV_NUM_GPUS = '1' and let it run for 1 'mini-epoch' and it will save a 'trained-1.pth'
# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = NUM_GPUS * single_gpu_batchsz,
# EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training
#
os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
### Change these if you want to continue training from a saved model ###################################
EPOCH_BEGIN = 0
LOAD_MODEL = False # shall we continue from the #EPOCH_BEGIN model?
os.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)
########################################################################################################
# if False: # True False ---> Set to False if you don't understand it
# print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
# import src.utils
# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)
import logging, types
from src.utils import Dataset
import torch
import numpy as np
from src.binidx import MMapIndexedDataset # for the Megatron-LM 'binidx' format
from src.binidx import MMapIndexedDataset
np.set_printoptions(precision=4, suppress=True, linewidth=200)
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -44,35 +16,95 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
### Step 1: set training data ##########################################################################
# if False: # True False ---> Set to False if you don't understand it
# print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
# import src.utils
# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)
########################################################################################################
# Step 1: set training data & cfg
########################################################################################################
EXPRESS_PILE_MODE = False # True: express mode for fine-tuning a pile model // False: usual training
EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-169M'
# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-430M'
########################################################################################################
datafile = "../data/enwik8" # your data
datafile_encoding = 'utf-8' # 'utf-8' 'utf-16le' 'binidx'
datafile_encoding = 'utf-8' # 'utf-8' / 'utf-16le' / 'numpy' (for fine-tuning pile models) / 'binidx' (the Megatron-LM 'binidx' format)
# datafile = './my-gpt_seq_document'
# datafile = 'my-gpt_seq_document'
# datafile_encoding = 'binidx'
### Step 2: set model size #############################################################################
if EXPRESS_PILE_MODE:
datafile = 'train.npy' # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy
datafile_encoding = 'numpy'
#
# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch
# set VOCAB_SIZE = 50277 for fine-tuning pile models
# set VOCAB_SIZE = your_vocab_size for 'binidx' data
#
os.environ['VOCAB_SIZE'] = '0'
if EXPRESS_PILE_MODE:
os.environ['VOCAB_SIZE'] = '50277'
#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
# 1) set RWKV_NUM_GPUS = '1' and let it run for 1 miniEpoch and it will save a trained-1.pth
# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = single_gpu_batchsz * RWKV_NUM_GPUS,
# EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training from it
#
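# For step 2) above, the changes would look roughly like this
# (illustrative values, using the default single-GPU batch size of 12 from Step 3):
#   os.environ['RWKV_NUM_GPUS'] = '8'
#   batch_size = 12 * 8        # single_gpu_batchsz * number of GPUs
#   EPOCH_BEGIN = 1            # continue from trained-1.pth
#   LOAD_MODEL = True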
os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future) or 'fp32'
os.environ['USE_WANDB'] = '0' # wandb logging. 0 = False, 1 = True
########################################################################################################
# Step 2: set model details
########################################################################################################
EPOCH_BEGIN = 0 # begins with miniEpoch = EPOCH_BEGIN
LOAD_MODEL = False # shall we load the #EPOCH_BEGIN model and continue the training from it?
ctx_len = 1024 # increase T_MAX in model.py if your ctx_len is very long
n_layer = 6
n_embd = 512
ctx_len = 1024 # increase T_MAX in src/model.py if your ctx_len is very long
# 'RWKV' or 'RWKV-ffnPre' (better in some cases)
model_type = 'RWKV'
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' (sometimes better)
# ---> there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py <---
# set it to 256, then it's using my headQK trick (similar to a tiny attention) to improve loss
# there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py
# set it to 256, then it's using my headQK trick (a tiny attention) to improve loss
# set it to 0, then it's a pure RNN (attention-free)
### Step 3: set batch size #############################################################################
if EXPRESS_PILE_MODE:
LOAD_MODEL = True
if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
n_layer = 12
n_embd = 768
ctx_len = 1024
elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-430M':
n_layer = 24
n_embd = 1024
ctx_len = 1024
elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-1B5':
n_layer = 24
n_embd = 2048
ctx_len = 1024
########################################################################################################
# Step 3: set batch size & learning rate etc.
########################################################################################################
# if you see "CUDA out of memory", reduce batch_size. Use nvidia-smi to watch GPU memory and find the largest batch_size that fits.
batch_size = 12 * NUM_GPUS
assert (batch_size % NUM_GPUS == 0)
batch_size = 12 * int(os.environ['RWKV_NUM_GPUS'])
assert (batch_size % int(os.environ['RWKV_NUM_GPUS']) == 0)
### Step 4: set learning rate, number of mini-epochs #######################################################
#
# By default we are using exponential LR decay.
# Here are my suggestions for training.
# Let's say you are training a L6-D512 model.
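# (Exponential decay here means, roughly, a geometric interpolation between lr_init and lr_final
#  as training progresses -- see src/trainer.py for the exact schedule:
#    lr ~= lr_init * (lr_final / lr_init) ** progress,   with progress going 0 -> 1)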
@@ -93,34 +125,51 @@ epoch_length_fixed = (10000 // batch_size) * batch_size # feel free to increase
# epoch_save_frequency 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...
epoch_save_frequency = 10
epoch_save_path = 'trained-'
MODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)
########################################################################################################
if EXPRESS_PILE_MODE:
if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
lr_init = 2e-5
else:
lr_init = 1e-5
lr_final = 1e-5
n_epoch = 100000
### misc stuffs ########################################################################################
if LOAD_MODEL and EPOCH_BEGIN > 0: # the saved .pth has no optimizer state, so use some LR warmup when continuing from a saved model
warmup_tokens = 50 * ctx_len * batch_size // NUM_GPUS
else:
warmup_tokens = 0
betas = (0.9, 0.99)
betas = (0.9, 0.99) # set betas = (0.9, 0.999) if your model has been trained for a while
eps = 1e-8
num_workers = 1 # DataLoader worker. I only tested num_workers = 1
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
os.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)
MODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)
if EXPRESS_PILE_MODE:
betas = (0.9, 0.999)
MODEL_NAME = EXPRESS_PILE_MODEL_NAME
########################################################################################################
# Load data
########################################################################################################
print('loading data... ' + datafile)
if datafile_encoding != 'binidx':
train_dataset = Dataset(open(
datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)
else:
print(f'loading {datafile_encoding} data... ' + datafile)
if datafile_encoding == 'binidx':
train_dataset = Dataset(MMapIndexedDataset(datafile), ctx_len, epoch_length_fixed)
elif datafile_encoding == 'numpy':
train_dataset = Dataset(np.load(datafile).astype('int'), ctx_len, epoch_length_fixed)
else:
train_dataset = Dataset(open(datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)
########################################################################################################
# Train model
########################################################################################################
if __name__ == '__main__':
from src.trainer import Trainer, TrainerConfig
@@ -180,12 +229,16 @@ if __name__ == '__main__':
"min_loss_scale": 1
}
trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=16)
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
DEEPSPEED_CFG["bf16"] = {
"enabled": True
}
trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision='bf16')
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=32)
print(trainer._strategy.config)
trainer.run(m_cfg, train_dataset, None, tconf)

@@ -17,15 +17,29 @@ import torch
from src.model_run import RWKV_RNN, RWKV_GPT
from src.model import GPT, GPTConfig
TOKEN_MODE = 'pile' # char / pile
if TOKEN_MODE == 'char':
MODEL_NAME = 'trained-1'
WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
ctx_len = 1024
n_layer = 6
n_embd = 512
model_type = 'RWKV'
UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity
elif TOKEN_MODE == 'pile':
WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
ctx_len = 1024
n_layer = 12
n_embd = 768
UNKNOWN_CHAR = None
model_name = 'trained-1'
model_type = 'RWKV'
from src.utils import TOKENIZER
tokenizer = TOKENIZER('vocab', UNKNOWN_CHAR=' ')
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
if TOKEN_MODE == 'pile':
tokenizer.vocab_size = 50277
########################################################################################################
@@ -36,17 +50,22 @@ if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
model_train = model_train.bfloat16()
print('loading ' + model_name)
m2 = torch.load(model_name + '.pth', map_location=RUN_DEVICE)
print('loading ' + MODEL_NAME)
m2 = torch.load(MODEL_NAME + '.pth', map_location=RUN_DEVICE)
model_train.load_state_dict(m2)
model_rnn = RWKV_RNN(model_name, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model_gpt = RWKV_GPT(model_name, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()
model_rnn = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model_gpt = RWKV_GPT(MODEL_NAME, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()
########################################################################################################
context = '\nIn a'
# context = '\nIn a'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'
if TOKEN_MODE == 'char':
ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
elif TOKEN_MODE == 'pile':
ctx = tokenizer.tokenizer.encode(context)
print(f'input len {len(ctx)} data {ctx}')
########################################################################################################
@@ -67,5 +86,5 @@ for i in range(src_len):
print('...')
print('\nRWKV-train output')
out = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().numpy()
out = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().float().numpy()
print(out, '\n')
