supports RWKV-4 pile models

main
BlinkDL 3 years ago
parent 61b7c429df
commit 68c486ad10

File diff suppressed because it is too large

@@ -1,37 +0,0 @@
{
    "zero_allow_untested_optimizer": true,
    "zero_optimization": {
        "stage": 2,
        "contiguous_gradients": true,
        "overlap_comm": true,
        "allgather_partitions": true,
        "reduce_scatter": true,
        "allgather_bucket_size": 200000000,
        "reduce_bucket_size": 200000000,
        "sub_group_size": 1000000000000
    },
    "activation_checkpointing": {
        "partition_activations": false,
        "cpu_checkpointing": false,
        "contiguous_memory_optimization": false,
        "synchronize_checkpoint_boundary": false
    },
    "aio": {
        "block_size": 1048576,
        "queue_depth": 8,
        "single_submit": false,
        "overlap_events": true,
        "thread_count": 1
    },
    "gradient_clipping": 1.0,
    "gradient_accumulation_steps": 1,
    "fp16": {
        "fp16": true,
        "enabled": true,
        "loss_scale": 0,
        "initial_scale_power": 12,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    }
}

@@ -15,70 +15,94 @@ torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
np.set_printoptions(precision=4, suppress=True, linewidth=200)

########################################################################################################
# Step 1: set model
#
# Set TOKEN_MODE to 'char' or 'bpe' if the model is trained by 'train.py' from scratch.
#
# Set TOKEN_MODE to 'pile' if you want to test pre-trained pile models.
########################################################################################################

TOKEN_MODE = 'char' # char / bpe / pile

n_layer = 6
n_embd = 512
ctx_len = 1024

if TOKEN_MODE == 'char':
    MODEL_NAME = 'trained-500' # your trained model
    WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
    # set UNKNOWN_CHAR to the rarest token in your vocab.json, and all unknown tokens in your prompt will be denoted by it
    UNKNOWN_CHAR = ' ' # here we just set it to ' ' for simplicity

elif TOKEN_MODE == 'bpe':
    MODEL_NAME = 'trained-500' # your trained model
    WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge] for your BPE model
    UNKNOWN_CHAR = None

elif TOKEN_MODE == 'pile':
    WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
    UNKNOWN_CHAR = None

    #---> you can set MODEL_NAME to your fine-tuned model <---

    MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
    # MODEL_NAME = 'trained-11'
    n_layer = 12
    n_embd = 768
    ctx_len = 1024

    # MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
    # n_layer = 24
    # n_embd = 1024
    # ctx_len = 1024

os.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment)
os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'

########################################################################################################
# Step 2: set prompt & sampling stuffs
########################################################################################################

# context = 'A'
# context = "\nIn the"
# context = '\nSugar:'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'

NUM_TRIALS = 999
LENGTH_PER_TRIAL = 333

TEMPERATURE = 1.0
top_p = 0.7
top_p_newline = 0.9 # only used in TOKEN_MODE = char

DEBUG_DEBUG = False # True False --> show softmax output

########################################################################################################

print(f'Loading {MODEL_NAME}...')
from src.model_run import RWKV_RNN
model = RWKV_RNN(MODEL_NAME, os.environ['RWKV_RUN_DEVICE'], model_type, n_layer, n_embd, ctx_len)
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)

########################################################################################################

if tokenizer.charMode:
    context = tokenizer.refine_context(context)
    ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
else:
    ctx = tokenizer.tokenizer.encode(context)
src_len = len(ctx)
src_ctx = ctx.copy()

print('\nYour prompt has ' + str(src_len) + ' tokens.')
print('\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\n')

for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
    t_begin = time.time_ns()
    print(('-' * 30) + context, end='')
    ctx = src_ctx.copy()
    model.clear()
    if TRIAL == 0:
        init_state = types.SimpleNamespace()

@@ -104,6 +128,9 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
            print('model', np.array(x), '==>', np.array(
                out), np.max(out), np.min(out))
        if TOKEN_MODE == 'pile':
            out[0] = -999999999  # disable <|endoftext|>
        char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,
                                       top_p_usual=top_p, top_p_newline=top_p_newline)
        char = char.item()

@@ -112,5 +139,6 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
        else:
            print(tokenizer.tokenizer.decode(int(char)), end='', flush=True)
        ctx += [char]
    t_end = time.time_ns()
    print("\n----------", round((t_end - t_begin) / (10 ** 9), 2), end='s ')

@@ -26,74 +26,58 @@ from torch.utils.cpp_extension import load
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
                verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])

class WKV(torch.autograd.Function):
    @staticmethod
    def forward(ctx, B, T, C, w, u, k, v):
        ctx.B = B
        ctx.T = T
        ctx.C = C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            w = -torch.exp(w.float().contiguous())
            u = u.float().contiguous()
            k = k.float().contiguous()
            v = v.float().contiguous()
        else:
            w = -torch.exp(w.contiguous())
            u = u.contiguous()
            k = k.contiguous()
            v = v.contiguous()
        ctx.save_for_backward(w, u, k, v)
        y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
        wkv_cuda.forward(B, T, C, w, u, k, v, y)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return y.half()
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return y.bfloat16()
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return y

    @staticmethod
    def backward(ctx, gy):
        B = ctx.B
        T = ctx.T
        C = ctx.C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        w, u, k, v = ctx.saved_tensors
        gw = torch.zeros((B, C), device='cuda')
        gu = torch.zeros((B, C), device='cuda')
        gk = torch.zeros((B, T, C), device='cuda')
        gv = torch.zeros((B, T, C), device='cuda')
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
        else:
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
        gw = torch.sum(gw, dim=0)
        gu = torch.sum(gu, dim=0)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return (None, None, None, gw, gu, gk, gv)

def RUN_CUDA(B, T, C, w, u, k, v):
    return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())

@@ -376,6 +360,8 @@ class GPT(nn.Module):
                c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).half()
            elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
                c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).bfloat16()
            elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
                c = c @ F.one_hot(idx, num_classes=self.config.vocab_size)
            x = self.head(x) + c
        else:
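The unified WKV class above only changes how precisions are handled around the CUDA kernel; the recurrence itself is unchanged. As a rough mental model of what the kernel computes (and a way to sanity-check it on tiny shapes), each channel keeps an exponentially decayed weighted sum of past values keyed by k, with a learned bonus u for the current token. The pure-PyTorch sketch below is my naive reference under that reading; it skips the kernel's numerical-stability tricks and is far too slow for real use.

import torch

def wkv_reference(w, u, k, v):
    # w: (C,) negative decay (i.e. already -exp(w)), u: (C,) bonus, k/v: (B, T, C)
    B, T, C = k.size()
    y = torch.zeros_like(v)
    for b in range(B):
        num = torch.zeros(C, dtype=k.dtype, device=k.device)  # decayed sum of exp(k_i) * v_i
        den = torch.zeros(C, dtype=k.dtype, device=k.device)  # decayed sum of exp(k_i)
        for t in range(T):
            kt, vt = k[b, t], v[b, t]
            # the current token contributes with an extra bonus u on its key
            y[b, t] = (num + torch.exp(u + kt) * vt) / (den + torch.exp(u + kt))
            # decay the history by exp(w) (w < 0), then absorb the current token
            num = torch.exp(w) * num + torch.exp(kt) * vt
            den = torch.exp(w) * den + torch.exp(kt)
    return y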

@@ -26,74 +26,58 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
                verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])

class WKV(torch.autograd.Function):
    @staticmethod
    def forward(ctx, B, T, C, w, u, k, v):
        ctx.B = B
        ctx.T = T
        ctx.C = C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            w = -torch.exp(w.float().contiguous())
            u = u.float().contiguous()
            k = k.float().contiguous()
            v = v.float().contiguous()
        else:
            w = -torch.exp(w.contiguous())
            u = u.contiguous()
            k = k.contiguous()
            v = v.contiguous()
        ctx.save_for_backward(w, u, k, v)
        y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
        wkv_cuda.forward(B, T, C, w, u, k, v, y)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return y.half()
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return y.bfloat16()
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return y

    @staticmethod
    def backward(ctx, gy):
        B = ctx.B
        T = ctx.T
        C = ctx.C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        w, u, k, v = ctx.saved_tensors
        gw = torch.zeros((B, C), device='cuda')
        gu = torch.zeros((B, C), device='cuda')
        gk = torch.zeros((B, T, C), device='cuda')
        gv = torch.zeros((B, T, C), device='cuda')
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
        else:
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
        gw = torch.sum(gw, dim=0)
        gu = torch.sum(gu, dim=0)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return (None, None, None, gw, gu, gk, gv)

def RUN_CUDA(B, T, C, w, u, k, v):
    return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())

@@ -19,6 +19,21 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

class L2Wrap(torch.autograd.Function):
    @staticmethod
    def forward(ctx, loss, y):
        ctx.save_for_backward(y)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        y = ctx.saved_tensors[0]
        # to encourage the logits to be close to 0
        factor = 1e-4 / (y.shape[0] * y.shape[1])
        maxx, ids = torch.max(y, -1, keepdim=True)
        gy = torch.zeros_like(y)
        gy.scatter_(-1, ids, maxx * factor)
        return (grad_output, gy)

class TrainerConfig:
    batch_size = 64
    learning_rate = 4e-4

@@ -109,14 +124,15 @@ class Trainer(LightningLite):
            for it, (x, y) in pbar:
                with torch.set_grad_enabled(is_train):
                    yyy, loss = model(x, y)  # forward the model
                    lossL2 = L2Wrap.apply(loss, yyy)

                all_loss = [loss.clone() for _ in range(NUM_GPUS)]
                torch.distributed.all_gather(all_loss, loss)

                if is_train:  # backprop and update the parameters
                    model.zero_grad()
                    self.backward(lossL2)
                    # deepspeed will handle gradient_clipping
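L2Wrap is the only change to the loss path: the forward pass returns the cross-entropy loss untouched, and the backward pass injects a tiny gradient that pushes each position's largest logit toward 0, which helps keep low-precision activations from drifting to large magnitudes. Under my reading (and assuming the usual incoming gradient of 1.0), this is equivalent to adding a small explicit penalty on the squared per-position maximum logit; the hypothetical helper below shows that formulation, the difference being that the penalty also shows up in the reported loss value.

import torch

def loss_with_logit_penalty(loss, logits, strength=1e-4):
    # logits: (B, T, vocab_size); penalize only each position's maximum logit,
    # scaled so its gradient matches L2Wrap's maxx * (strength / (B * T))
    B, T = logits.shape[0], logits.shape[1]
    maxx = logits.max(dim=-1).values
    return loss + 0.5 * (strength / (B * T)) * (maxx ** 2).sum()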

@@ -22,13 +22,19 @@ class Dataset(Dataset):
        self.data = data

        if 'MMapIndexedDataset' in str(type(self.data)):
            self.vocab_size = int(os.environ['VOCAB_SIZE'])
            print('current vocab size =', self.vocab_size, "(make sure it's correct)")
            self.data_size = len(self.data._bin_buffer) // 2
            print(f'data has {self.data_size} tokens.')
        elif 'numpy' in str(type(self.data)):
            self.vocab_size = int(os.environ['VOCAB_SIZE'])
            print('current vocab size =', self.vocab_size, "(make sure it's correct)")
            self.data_size = len(self.data)
            print(f'data has {self.data_size} tokens.')
        else:
            print('building token list...', end=' ')
            unique = sorted(list(set(data)))
            self.vocab_size = len(unique)
            # print()
            # for u in unique:
            #     print(u, end=' ')

@@ -41,25 +47,25 @@ class Dataset(Dataset):
                xx += 1
            with open('vocab.json', "w", encoding="utf-16") as vocab_file:
                vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
            self.data_size = len(self.data)
            print('data has %d tokens, %d unique.' % (self.data_size, self.vocab_size))
            self.stoi = {ch: i for i, ch in enumerate(unique)}
            self.itos = {i: ch for i, ch in enumerate(unique)}

    def __len__(self):
        return self.epoch_length_fixed // NUM_GPUS

    def __getitem__(self, idx):
        #
        # we are cheating: pick a random spot in dataset
        #
        i = np.random.randint(0, self.data_size - (self.ctx_len + 1))
        if 'MMapIndexedDataset' in str(type(self.data)):
            dix = self.data.get(idx=0, offset=i, length=self.ctx_len + 1).astype(int)
        elif 'numpy' in str(type(self.data)):
            dix = self.data[i:i+self.ctx_len+1]
        else:
            dix = [self.stoi[s] for s in self.data[i:i+self.ctx_len+1]]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)

@@ -70,8 +76,12 @@ class TOKENIZER():
    def __init__(self, WORD_NAME, UNKNOWN_CHAR='\ue083'):
        if 'list' in str(type(WORD_NAME)):
            self.charMode = False
            if WORD_NAME[0] == WORD_NAME[1]:
                from transformers import PreTrainedTokenizerFast
                self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])
            else:
                from transformers import GPT2TokenizerFast
                self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])
        else:
            self.charMode = True
            with open(WORD_NAME + '.json', "r", encoding="utf-16") as result_file:

@@ -85,15 +95,13 @@ class TOKENIZER():
            self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]

    def refine_context(self, context):
        context = context.strip().split('\n')
        for c in range(len(context)):
            context[c] = context[c].strip().strip('\u3000').strip('\r')
        context = list(filter(lambda c: c != '', context))
        context = '\n' + ('\n'.join(context)).strip()
        if context == '':
            context = '\n'
        return context

    def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):
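With this change, passing the same tokenizer file twice as WORD_NAME selects the HuggingFace PreTrainedTokenizerFast path used by the pile models, a [vocab, merges] pair still goes through GPT2TokenizerFast, and a plain string stays char mode. A small usage sketch, assuming 20B_tokenizer.json is present and transformers is installed:

from src.utils import TOKENIZER

# pile models: one tokenizer json, given twice
tok = TOKENIZER(['20B_tokenizer.json', '20B_tokenizer.json'], UNKNOWN_CHAR=None)
ids = tok.tokenizer.encode('\nIn a shocking finding, scientist discovered a herd of dragons')
print(len(ids), ids[:10])
print(tok.tokenizer.decode(ids))

# char-level models trained by train.py: pass the vocab name instead
# tok = TOKENIZER('vocab', UNKNOWN_CHAR=' ')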

@@ -3,39 +3,11 @@
########################################################################################################

import os

import logging, types
from src.utils import Dataset
import torch
import numpy as np
from src.binidx import MMapIndexedDataset

np.set_printoptions(precision=4, suppress=True, linewidth=200)
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",

@@ -44,35 +16,95 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# if False: # True False ---> Set to False if you don't understand it
#     print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
#     import src.utils
#     src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)

########################################################################################################
# Step 1: set training data & cfg
########################################################################################################

EXPRESS_PILE_MODE = False # True: express mode for fine-tuning a pile model // False: usual training

EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-169M'
# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-430M'

########################################################################################################

datafile = "../data/enwik8" # your data
datafile_encoding = 'utf-8' # 'utf-8' / 'utf-16le' / 'numpy' (for fine-tuning pile models) / 'binidx' (the Megatron-LM 'binidx' format)
# datafile = 'my-gpt_seq_document'
# datafile_encoding = 'binidx'

if EXPRESS_PILE_MODE:
    datafile = 'train.npy' # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy
    datafile_encoding = 'numpy'

#
# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch
# set VOCAB_SIZE = 50277 for fine-tuning pile models
# set VOCAB_SIZE = your_vocab_size for 'binidx' data
#
os.environ['VOCAB_SIZE'] = '0'
if EXPRESS_PILE_MODE:
    os.environ['VOCAB_SIZE'] = '50277'

#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
# 1) set RWKV_NUM_GPUS = '1' and let it run for 1 miniEpoch and it will save a trained-1.pth
# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = single_gpu_batchsz * RWKV_NUM_GPUS,
#    EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training from it
#
os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use

os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future) or 'fp32'
os.environ['USE_WANDB'] = '0' # wandb logging. 0 = False, 1 = True

########################################################################################################
# Step 2: set model details
########################################################################################################

EPOCH_BEGIN = 0 # begins with miniEpoch = EPOCH_BEGIN
LOAD_MODEL = False # shall we load the #EPOCH_BEGIN model and continue the training from it?

ctx_len = 1024 # increase T_MAX in model.py if your ctx_len is very long
n_layer = 6
n_embd = 512

model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' (sometimes better)

# there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py
# set it to 256, then it's using my headQK trick (a tiny attention) to improve loss
# set it to 0, then it's a pure RNN (attention-free)

if EXPRESS_PILE_MODE:
    LOAD_MODEL = True
    if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
        n_layer = 12
        n_embd = 768
        ctx_len = 1024
    elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-430M':
        n_layer = 24
        n_embd = 1024
        ctx_len = 1024
    elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-1B5':
        n_layer = 24
        n_embd = 2048
        ctx_len = 1024

########################################################################################################
# Step 3: set batch size & learning rate etc.
########################################################################################################

# if you see "CUDA out of memory", reduce batch_size. Use nvidia-smi to find the highest value for your GPU.
batch_size = 12 * int(os.environ['RWKV_NUM_GPUS'])
assert (batch_size % int(os.environ['RWKV_NUM_GPUS']) == 0)

# By default we are using exponential LR decay.
# Here are my suggestions for training.
# Let's say you are training a L6-D512 model.

@@ -93,34 +125,51 @@ epoch_length_fixed = (10000 // batch_size) * batch_size # feel free to increase

# epoch_save_frequency 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...
epoch_save_frequency = 10
epoch_save_path = 'trained-'

if EXPRESS_PILE_MODE:
    if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
        lr_init = 2e-5
    else:
        lr_init = 1e-5
    lr_final = 1e-5
    n_epoch = 100000

if LOAD_MODEL and EPOCH_BEGIN > 0: # we are not saving gradients, so let's have some warmup if we load a model
    warmup_tokens = 50 * ctx_len * batch_size // NUM_GPUS
else:
    warmup_tokens = 0

betas = (0.9, 0.99) # set betas = (0.9, 0.999) if your model has been trained for a while
eps = 1e-8

num_workers = 1 # DataLoader worker. I only tested num_workers = 1

NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
os.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)
MODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)

if EXPRESS_PILE_MODE:
    betas = (0.9, 0.999)
    MODEL_NAME = EXPRESS_PILE_MODEL_NAME

########################################################################################################
# Load data
########################################################################################################

print(f'loading {datafile_encoding} data... ' + datafile)
if datafile_encoding == 'binidx':
    train_dataset = Dataset(MMapIndexedDataset(datafile), ctx_len, epoch_length_fixed)
elif datafile_encoding == 'numpy':
    train_dataset = Dataset(np.load(datafile).astype('int'), ctx_len, epoch_length_fixed)
else:
    train_dataset = Dataset(open(datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)

########################################################################################################
# Train model
########################################################################################################

if __name__ == '__main__':
    from src.trainer import Trainer, TrainerConfig

@@ -180,12 +229,16 @@ if __name__ == '__main__':
            "min_loss_scale": 1
        }
        trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=16)
    elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
        DEEPSPEED_CFG["bf16"] = {
            "enabled": True
        }
        trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision='bf16')
    elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
        trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=32)

    print(trainer._strategy.config)
    trainer.run(m_cfg, train_dataset, None, tconf)
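The 'numpy' datafile_encoding above expects train.npy to already hold pile-tokenizer ids; the comments point to prepare-data.py in the RWKV-v2-RNN-Pile repo for that step. As a rough, hypothetical stand-in (assuming 20B_tokenizer.json and a plain-text corpus file of your own), the conversion is essentially:

import numpy as np
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='20B_tokenizer.json')

with open('my-corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read()

ids = tokenizer.encode(text)  # ids in the 50277-token pile vocab
np.save('train.npy', np.array(ids, dtype='uint16'))  # uint16 is enough for vocab < 65536
print(f'saved {len(ids)} tokens')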

@@ -17,15 +17,29 @@ import torch
from src.model_run import RWKV_RNN, RWKV_GPT
from src.model import GPT, GPTConfig

TOKEN_MODE = 'pile' # char / pile

if TOKEN_MODE == 'char':
    MODEL_NAME = 'trained-1'
    WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
    ctx_len = 1024
    n_layer = 6
    n_embd = 512
    UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity
elif TOKEN_MODE == 'pile':
    WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
    MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
    ctx_len = 1024
    n_layer = 12
    n_embd = 768
    UNKNOWN_CHAR = None

model_type = 'RWKV'

from src.utils import TOKENIZER
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
if TOKEN_MODE == 'pile':
    tokenizer.vocab_size = 50277

########################################################################################################

@@ -36,17 +50,22 @@ if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
    model_train = model_train.bfloat16()

print('loading ' + MODEL_NAME)
m2 = torch.load(MODEL_NAME + '.pth', map_location=RUN_DEVICE)
model_train.load_state_dict(m2)

model_rnn = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model_gpt = RWKV_GPT(MODEL_NAME, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()

########################################################################################################

# context = '\nIn a'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'

if TOKEN_MODE == 'char':
    ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
elif TOKEN_MODE == 'pile':
    ctx = tokenizer.tokenizer.encode(context)
print(f'input len {len(ctx)} data {ctx}')

########################################################################################################

@@ -67,5 +86,5 @@ for i in range(src_len):
        print('...')

print('\nRWKV-train output')
out = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().float().numpy()
print(out, '\n')
