supports RWKV-4 pile models

main
BlinkDL 3 years ago
parent 61b7c429df
commit 68c486ad10

(File diff suppressed because it is too large.)

@@ -1,37 +0,0 @@
{
"zero_allow_untested_optimizer":true,
"zero_optimization":{
"stage":2,
"contiguous_gradients":true,
"overlap_comm":true,
"allgather_partitions":true,
"reduce_scatter":true,
"allgather_bucket_size":200000000,
"reduce_bucket_size":200000000,
"sub_group_size":1000000000000
},
"activation_checkpointing":{
"partition_activations":false,
"cpu_checkpointing":false,
"contiguous_memory_optimization":false,
"synchronize_checkpoint_boundary":false
},
"aio":{
"block_size":1048576,
"queue_depth":8,
"single_submit":false,
"overlap_events":true,
"thread_count":1
},
"gradient_clipping": 1.0,
"gradient_accumulation_steps": 1,
"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"initial_scale_power": 12,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
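A config like the one above is just a plain DeepSpeed dict; further down in this diff, train.py builds an equivalent dict named DEEPSPEED_CFG and hands it to Lightning's DeepSpeedStrategy. A minimal sketch of that pattern, assuming the JSON is saved as ds_config.json (hypothetical filename) and a pytorch_lightning >= 1.6 import layout:

import json
from pytorch_lightning.strategies import DeepSpeedStrategy  # import path is an assumption for PL >= 1.6

with open('ds_config.json') as f:      # hypothetical path to a config like the one above
    DEEPSPEED_CFG = json.load(f)

# same pattern as the Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), ...) calls in train.py below
strategy = DeepSpeedStrategy(config=DEEPSPEED_CFG)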

@@ -15,70 +15,94 @@ torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
np.set_printoptions(precision=4, suppress=True, linewidth=200)
### Step 1: set model ##################################################################################
########################################################################################################
# Step 1: set model
#
# Set TOKEN_MODE to 'char' or 'bpe' if the model was trained from scratch by 'train.py'.
#
# Set TOKEN_MODE to 'pile' if you want to test pre-trained pile models.
########################################################################################################
os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' or 'fp16'
os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'
RUN_DEVICE = os.environ['RWKV_RUN_DEVICE']
TOKEN_MODE = 'char' # char / bpe / pile
ctx_len = 1024
n_layer = 6
n_embd = 512
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'
### Step 2: set vocab & context ########################################################################
CHAR_MODE = True # True False
ctx_len = 1024
if CHAR_MODE:
### example 1: char-level model
if TOKEN_MODE == 'char':
MODEL_NAME = 'trained-500' # your trained model
WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <--
# --> all unknown tokens in your context will be denoted by it <--
UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity
context = "\nIn the" # your prompt
else:
### example 2: BPE-level model
MODEL_NAME = 'trained-7773'
WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge]
# set UNKNOWN_CHAR to the rarest token in your vocab.json, and all unknown tokens in your prompt will be denoted by it
UNKNOWN_CHAR = ' ' # here we just set it to ' ' for simplicity
elif TOKEN_MODE == 'bpe':
MODEL_NAME = 'trained-500' # your trained model
WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge] for your BPE model
UNKNOWN_CHAR = None
context = 'A'
### Step 3: other config ###############################################################################
elif TOKEN_MODE == 'pile':
WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
UNKNOWN_CHAR = None
#---> you can set MODEL_NAME to your fine-tuned model <---
MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
# MODEL_NAME = 'trained-11'
n_layer = 12
n_embd = 768
ctx_len = 1024
# MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
# n_layer = 24
# n_embd = 1024
# ctx_len = 1024
os.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment)
os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'
########################################################################################################
# Step 2: set prompt & sampling stuffs
########################################################################################################
DEBUG_DEBUG = False # True False - show softmax output
# context = 'A'
# context = "\nIn the"
# context = '\nSugar:'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'
NUM_TRIALS = 999
LENGTH_PER_TRIAL = 500
LENGTH_PER_TRIAL = 333
TEMPERATURE = 1.0
top_p = 0.7
top_p_newline = 0.9
top_p_newline = 0.9 # only used in TOKEN_MODE = char
DEBUG_DEBUG = False # True False --> show softmax output
########################################################################################################
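# For orientation only: a rough, self-contained sketch of temperature + top-p (nucleus)
# sampling as controlled by TEMPERATURE and top_p above. This is NOT the repo's
# sample_logits (see src/utils.py); the function name is made up for illustration.
def sample_top_p_sketch(logits, temperature=1.0, top_p=0.7):
    import numpy as np  # np is also imported at the top of this script
    probs = np.exp((logits - np.max(logits)) / temperature)            # numerically stable softmax
    probs = probs / probs.sum()
    sorted_probs = np.sort(probs)[::-1]
    cutoff = sorted_probs[np.argmax(np.cumsum(sorted_probs) >= top_p)]  # smallest prob kept in the nucleus
    probs[probs < cutoff] = 0                                           # drop the tail
    return np.random.choice(len(probs), p=probs / probs.sum())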
print(f'Loading {MODEL_NAME}...')
from src.model_run import RWKV_RNN
model = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model = RWKV_RNN(MODEL_NAME, os.environ['RWKV_RUN_DEVICE'], model_type, n_layer, n_embd, ctx_len)
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
########################################################################################################
if tokenizer.charMode:
context = tokenizer.refine_context(context)
print('\nYour prompt has ' + str(len(context)) + ' tokens.')
ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
else:
ctx = tokenizer.tokenizer.encode(context)
src_len = len(ctx)
src_ctx = ctx.copy()
print('\nYour prompt has ' + str(src_len) + ' tokens.')
print('\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\n')
for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
t_begin = time.time_ns()
src_len = len(context)
if tokenizer.charMode:
ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
else:
ctx = tokenizer.tokenizer.encode(context)
print(('-' * 30) + context, end='')
ctx = src_ctx.copy()
model.clear()
if TRIAL == 0:
init_state = types.SimpleNamespace()
@@ -104,6 +128,9 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
print('model', np.array(x), '==>', np.array(
out), np.max(out), np.min(out))
if TOKEN_MODE == 'pile':
out[0] = -999999999 # disable <|endoftext|>
char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,
top_p_usual=top_p, top_p_newline=top_p_newline)
char = char.item()
@@ -112,5 +139,6 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
else:
print(tokenizer.tokenizer.decode(int(char)), end='', flush=True)
ctx += [char]
t_end = time.time_ns()
print("\n----------", round((t_end - t_begin) / (10 ** 9), 2), end='s ')

@@ -26,7 +26,6 @@ from torch.utils.cpp_extension import load
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])
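# (What the kernel computes, roughly: for each batch element and channel, the WKV mixing
#  y_t = (sum_{i<t} exp(w*(t-1-i) + k_i) * v_i + exp(u + k_t) * v_t)
#        / (sum_{i<t} exp(w*(t-1-i) + k_i) + exp(u + k_t)),
#  where w is negative (w = -exp(w_param) below), so contributions from older tokens decay exponentially.)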
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
@@ -35,48 +34,25 @@ if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
else:
w = -torch.exp(w.contiguous())
u = u.contiguous()
k = k.contiguous()
v = v.contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return y.half()
@staticmethod
def backward(ctx, gy):
B = ctx.B
T = ctx.T
C = ctx.C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w, u, k, v = ctx.saved_tensors
gw = torch.zeros((B, C), device='cuda')
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
ctx.B = B
ctx.T = T
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
return y.bfloat16()
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return y
@staticmethod
def backward(ctx, gy):
@@ -90,10 +66,18 @@ elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
else:
wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return (None, None, None, gw, gu, gk, gv)
def RUN_CUDA(B, T, C, w, u, k, v):
return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())
@@ -376,6 +360,8 @@ class GPT(nn.Module):
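# (Context for the lines below: with RWKV_HEAD_QK_DIM > 0, c holds causal q@k "copy" scores
#  of shape (B, T, T); multiplying by the one-hot of the input ids turns them into logits over
#  the vocab that favor copying earlier tokens, and they are added to the normal head output.
#  The one-hot dtype has to match RWKV_FLOAT_MODE, which is what this hunk adds for fp32.)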
c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).half()
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).bfloat16()
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
c = c @ F.one_hot(idx, num_classes=self.config.vocab_size)
x = self.head(x) + c
else:

@@ -26,7 +26,6 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
@@ -35,48 +34,25 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
else:
w = -torch.exp(w.contiguous())
u = u.contiguous()
k = k.contiguous()
v = v.contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return y.half()
@staticmethod
def backward(ctx, gy):
B = ctx.B
T = ctx.T
C = ctx.C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w, u, k, v = ctx.saved_tensors
gw = torch.zeros((B, C), device='cuda')
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
class WKV(torch.autograd.Function):
@staticmethod
def forward(ctx, B, T, C, w, u, k, v):
ctx.B = B
ctx.T = T
ctx.C = C
assert T <= T_MAX
assert B * C % min(C, 1024) == 0
w = -torch.exp(w.float().contiguous())
u = u.float().contiguous()
k = k.float().contiguous()
v = v.float().contiguous()
ctx.save_for_backward(w, u, k, v)
y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
wkv_cuda.forward(B, T, C, w, u, k, v, y)
return y.bfloat16()
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return y
@staticmethod
def backward(ctx, gy):
@@ -90,10 +66,18 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
gu = torch.zeros((B, C), device='cuda')
gk = torch.zeros((B, T, C), device='cuda')
gv = torch.zeros((B, T, C), device='cuda')
if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
else:
wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
gw = torch.sum(gw, dim=0)
gu = torch.sum(gu, dim=0)
if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
return (None, None, None, gw, gu, gk, gv)
def RUN_CUDA(B, T, C, w, u, k, v):
return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())

@@ -19,6 +19,21 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
class L2Wrap(torch.autograd.Function):
@staticmethod
def forward(ctx, loss, y):
ctx.save_for_backward(y)
return loss
@staticmethod
def backward(ctx, grad_output):
y = ctx.saved_tensors[0]
# to encourage the logits to be close to 0
factor = 1e-4 / (y.shape[0] * y.shape[1])
maxx, ids = torch.max(y, -1, keepdim=True)
gy = torch.zeros_like(y)
gy.scatter_(-1, ids, maxx * factor)
return (grad_output, gy)
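# (Effect: backward leaves the real loss gradient untouched and adds a tiny extra gradient,
#  factor * max(y), at the position of the largest logit of each token, which gently pulls
#  the top logits toward 0 without changing the reported cross-entropy loss.)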
class TrainerConfig:
batch_size = 64
learning_rate = 4e-4
@@ -109,14 +124,15 @@ class Trainer(LightningLite):
for it, (x, y) in pbar:
with torch.set_grad_enabled(is_train):
_, loss = model(x, y) # forward the model
yyy, loss = model(x, y) # forward the model
lossL2 = L2Wrap.apply(loss, yyy)
all_loss = [loss.clone() for _ in range(NUM_GPUS)]
torch.distributed.all_gather(all_loss, loss)
if is_train: # backprop and update the parameters
model.zero_grad()
self.backward(loss)
self.backward(lossL2)
# deepspeed will handle gradient_clipping

@@ -22,13 +22,19 @@ class Dataset(Dataset):
self.data = data
if 'MMapIndexedDataset' in str(type(self.data)):
self.vocab_size = 253 # your vocab_size
self.vocab_size = int(os.environ['VOCAB_SIZE'])
print('current vocab size =', self.vocab_size, "(make sure it's correct)")
self.data_size = len(self.data._bin_buffer) // 2
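# (the "// 2" above assumes 2-byte uint16 tokens, which is what the pile-style binidx data uses)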
self.item_cnt = len(self.data)
print(f'data has {self.data_size} tokens.')
elif 'numpy' in str(type(self.data)):
self.vocab_size = int(os.environ['VOCAB_SIZE'])
print('current vocab size =', self.vocab_size, "(make sure it's correct)")
self.data_size = len(self.data)
print(f'data has {self.data_size} tokens.')
else:
print('building token list...', end=' ')
unique = sorted(list(set(data)))
self.vocab_size = len(unique)
# print()
# for u in unique:
# print(u, end=' ')
@@ -41,25 +47,25 @@ class Dataset(Dataset):
xx += 1
with open('vocab.json', "w", encoding="utf-16") as vocab_file:
vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
data_size, vocab_size = len(data), len(unique)
print('data has %d tokens, %d unique.' % (data_size, vocab_size))
self.data_size = len(self.data)
print('data has %d tokens, %d unique.' % (self.data_size, self.vocab_size))
self.stoi = {ch: i for i, ch in enumerate(unique)}
self.itos = {i: ch for i, ch in enumerate(unique)}
self.vocab_size = vocab_size
def __len__(self):
return self.epoch_length_fixed // NUM_GPUS
def __getitem__(self, idx):
# cheat: pick a random spot in dataset
if 'MMapIndexedDataset' in str(type(self.data)):
#
# we are cheating: pick a random spot in dataset
#
i = np.random.randint(0, self.data_size - (self.ctx_len + 1))
if 'MMapIndexedDataset' in str(type(self.data)):
dix = self.data.get(idx=0, offset=i, length=self.ctx_len + 1).astype(int)
elif 'numpy' in str(type(self.data)):
dix = self.data[i:i+self.ctx_len+1]
else:
i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))
chunk = self.data[i:i+self.ctx_len+1]
dix = [self.stoi[s] for s in chunk]
dix = [self.stoi[s] for s in self.data[i:i+self.ctx_len+1]]
x = torch.tensor(dix[:-1], dtype=torch.long)
y = torch.tensor(dix[1:], dtype=torch.long)
@@ -70,6 +76,10 @@ class TOKENIZER():
def __init__(self, WORD_NAME, UNKNOWN_CHAR='\ue083'):
if 'list' in str(type(WORD_NAME)):
self.charMode = False
if WORD_NAME[0] == WORD_NAME[1]:
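# same file given twice => a single HuggingFace tokenizer.json (e.g. the 20B_tokenizer.json used for pile models)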
from transformers import PreTrainedTokenizerFast
self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])
else:
from transformers import GPT2TokenizerFast
self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])
else:
@@ -85,7 +95,6 @@ class TOKENIZER():
self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]
def refine_context(self, context):
if self.charMode:
context = context.strip().split('\n')
for c in range(len(context)):
context[c] = context[c].strip().strip('\u3000').strip('\r')
@@ -93,7 +102,6 @@ class TOKENIZER():
context = '\n' + ('\n'.join(context)).strip()
if context == '':
context = '\n'
return context
def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):

@@ -3,39 +3,11 @@
########################################################################################################
import os
os.environ['USE_WANDB'] = '0' # 0 = False, 1 = True
os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future)
### This is using DeepSpeed stage2 + FP16 ##############################################################
#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
# 1) leave RWKV_NUM_GPUS = '1' and let it run for 1 'mini-epoch' and it will save a 'trained-1.pth'
# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = NUM_GPUS * single_gpu_batchsz,
# EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training
#
os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
### Change these if you want to continue training from a saved model ###################################
EPOCH_BEGIN = 0
LOAD_MODEL = False # shall we continue from the #EPOCH_BEGIN model?
os.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)
########################################################################################################
# if False: # True False ---> Set to False if you don't understand it
# print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
# import src.utils
# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)
import logging, types
from src.utils import Dataset
import torch
import numpy as np
from src.binidx import MMapIndexedDataset # for the Megatron-LM 'binidx' format
from src.binidx import MMapIndexedDataset
np.set_printoptions(precision=4, suppress=True, linewidth=200)
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -44,35 +16,95 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
### Step 1: set training data ##########################################################################
# if False: # True False ---> Set to False if you don't understand it
# print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
# import src.utils
# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)
########################################################################################################
# Step 1: set training data & cfg
########################################################################################################
EXPRESS_PILE_MODE = False # True: express mode for fine-tuning a pile model // False: usual training
EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-169M'
# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-430M'
########################################################################################################
datafile = "../data/enwik8" # your data
datafile_encoding = 'utf-8' # 'utf-8' 'utf-16le' 'binidx'
datafile_encoding = 'utf-8' # 'utf-8' / 'utf-16le' / 'numpy' (for fine-tuning pile models) / 'binidx' (the Megatron-LM 'binidx' format)
# datafile = './my-gpt_seq_document'
# datafile = 'my-gpt_seq_document'
# datafile_encoding = 'binidx'
### Step 2: set model size #############################################################################
if EXPRESS_PILE_MODE:
datafile = 'train.npy' # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy
datafile_encoding = 'numpy'
#
# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch
# set VOCAB_SIZE = 50277 for fine-tuning pile models
# set VOCAB_SIZE = your_vocab_size for 'binidx' data
#
os.environ['VOCAB_SIZE'] = '0'
if EXPRESS_PILE_MODE:
os.environ['VOCAB_SIZE'] = '50277'
#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
# 1) set RWKV_NUM_GPUS = '1' and let it run for 1 miniEpoch and it will save a trained-1.pth
# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = single_gpu_batchsz * RWKV_NUM_GPUS,
# EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training from it
#
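# For step 2) above, the changes would look roughly like this
# (illustrative values, using the default single-GPU batch size of 12 from Step 3):
#   os.environ['RWKV_NUM_GPUS'] = '8'
#   batch_size = 12 * 8        # single_gpu_batchsz * number of GPUs
#   EPOCH_BEGIN = 1            # continue from trained-1.pth
#   LOAD_MODEL = True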
os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future) or 'fp32'
os.environ['USE_WANDB'] = '0' # wandb logging. 0 = False, 1 = True
########################################################################################################
# Step 2: set model details
########################################################################################################
EPOCH_BEGIN = 0 # begins with miniEpoch = EPOCH_BEGIN
LOAD_MODEL = False # shall we load the #EPOCH_BEGIN model and continue the training from it?
ctx_len = 1024 # increase T_MAX in model.py if your ctx_len is very long
n_layer = 6
n_embd = 512
ctx_len = 1024 # increase T_MAX in src/model.py if your ctx_len is very long
# 'RWKV' or 'RWKV-ffnPre' (better in some cases)
model_type = 'RWKV'
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' (sometimes better)
# ---> there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py <---
# set it to 256, then it's using my headQK trick (similar to a tiny attention) to improve loss
# there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py
# set it to 256, then it's using my headQK trick (a tiny attention) to improve loss
# set it to 0, then it's a pure RNN (attention-free)
### Step 3: set batch size #############################################################################
if EXPRESS_PILE_MODE:
LOAD_MODEL = True
if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
n_layer = 12
n_embd = 768
ctx_len = 1024
elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-430M':
n_layer = 24
n_embd = 1024
ctx_len = 1024
elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-1B5':
n_layer = 24
n_embd = 2048
ctx_len = 1024
########################################################################################################
# Step 3: set batch size & learning rate etc.
########################################################################################################
# if you see "CUDA out of memory", reduce batch_size. Use nvidia-smi to watch GPU memory and find the largest batch_size that fits.
batch_size = 12 * NUM_GPUS
assert (batch_size % NUM_GPUS == 0)
batch_size = 12 * int(os.environ['RWKV_NUM_GPUS'])
assert (batch_size % int(os.environ['RWKV_NUM_GPUS']) == 0)
### Step 4: set learning rate, number of mini-epochs #######################################################
#
# By default we are using exponential LR decay.
# Here are my suggestions for training.
# Let's say you are training a L6-D512 model.
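# (Exponential decay here means, roughly, a geometric interpolation between lr_init and lr_final
#  as training progresses -- see src/trainer.py for the exact schedule:
#    lr ~= lr_init * (lr_final / lr_init) ** progress,   with progress going 0 -> 1)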
@@ -93,34 +125,51 @@ epoch_length_fixed = (10000 // batch_size) * batch_size # feel free to increase
# epoch_save_frequency 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...
epoch_save_frequency = 10
epoch_save_path = 'trained-'
MODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)
########################################################################################################
if EXPRESS_PILE_MODE:
if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
lr_init = 2e-5
else:
lr_init = 1e-5
lr_final = 1e-5
n_epoch = 100000
### misc stuffs ########################################################################################
if LOAD_MODEL and EPOCH_BEGIN > 0: # the saved .pth has no optimizer state, so use some LR warmup when continuing from a saved model
warmup_tokens = 50 * ctx_len * batch_size // NUM_GPUS
else:
warmup_tokens = 0
betas = (0.9, 0.99)
betas = (0.9, 0.99) # set betas = (0.9, 0.999) if your model has been trained for a while
eps = 1e-8
num_workers = 1 # DataLoader worker. I only tested num_workers = 1
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
os.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)
MODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)
if EXPRESS_PILE_MODE:
betas = (0.9, 0.999)
MODEL_NAME = EXPRESS_PILE_MODEL_NAME
########################################################################################################
# Load data
########################################################################################################
print('loading data... ' + datafile)
if datafile_encoding != 'binidx':
train_dataset = Dataset(open(
datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)
else:
print(f'loading {datafile_encoding} data... ' + datafile)
if datafile_encoding == 'binidx':
train_dataset = Dataset(MMapIndexedDataset(datafile), ctx_len, epoch_length_fixed)
elif datafile_encoding == 'numpy':
train_dataset = Dataset(np.load(datafile).astype('int'), ctx_len, epoch_length_fixed)
else:
train_dataset = Dataset(open(datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)
########################################################################################################
# Train model
########################################################################################################
if __name__ == '__main__':
from src.trainer import Trainer, TrainerConfig
@@ -180,12 +229,16 @@ if __name__ == '__main__':
"min_loss_scale": 1
}
trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=16)
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
DEEPSPEED_CFG["bf16"] = {
"enabled": True
}
trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision='bf16')
elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=32)
print(trainer._strategy.config)
trainer.run(m_cfg, train_dataset, None, tconf)

@@ -17,15 +17,29 @@ import torch
from src.model_run import RWKV_RNN, RWKV_GPT
from src.model import GPT, GPTConfig
TOKEN_MODE = 'pile' # char / pile
if TOKEN_MODE == 'char':
MODEL_NAME = 'trained-1'
WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
ctx_len = 1024
n_layer = 6
n_embd = 512
model_type = 'RWKV'
UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity
elif TOKEN_MODE == 'pile':
WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
ctx_len = 1024
n_layer = 12
n_embd = 768
UNKNOWN_CHAR = None
model_name = 'trained-1'
model_type = 'RWKV'
from src.utils import TOKENIZER
tokenizer = TOKENIZER('vocab', UNKNOWN_CHAR=' ')
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
if TOKEN_MODE == 'pile':
tokenizer.vocab_size = 50277
########################################################################################################
@@ -36,17 +50,22 @@ if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
model_train = model_train.bfloat16()
print('loading ' + model_name)
m2 = torch.load(model_name + '.pth', map_location=RUN_DEVICE)
print('loading ' + MODEL_NAME)
m2 = torch.load(MODEL_NAME + '.pth', map_location=RUN_DEVICE)
model_train.load_state_dict(m2)
model_rnn = RWKV_RNN(model_name, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model_gpt = RWKV_GPT(model_name, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()
model_rnn = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model_gpt = RWKV_GPT(MODEL_NAME, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()
########################################################################################################
context = '\nIn a'
# context = '\nIn a'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'
if TOKEN_MODE == 'char':
ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
elif TOKEN_MODE == 'pile':
ctx = tokenizer.tokenizer.encode(context)
print(f'input len {len(ctx)} data {ctx}')
########################################################################################################
@@ -67,5 +86,5 @@ for i in range(src_len):
print('...')
print('\nRWKV-train output')
out = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().numpy()
out = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().float().numpy()
print(out, '\n')
