supports RWKV-4 pile models

main
BlinkDL 3 years ago
parent 61b7c429df
commit 68c486ad10

File diff suppressed because it is too large

@@ -1,37 +0,0 @@
{
    "zero_allow_untested_optimizer": true,
    "zero_optimization": {
        "stage": 2,
        "contiguous_gradients": true,
        "overlap_comm": true,
        "allgather_partitions": true,
        "reduce_scatter": true,
        "allgather_bucket_size": 200000000,
        "reduce_bucket_size": 200000000,
        "sub_group_size": 1000000000000
    },
    "activation_checkpointing": {
        "partition_activations": false,
        "cpu_checkpointing": false,
        "contiguous_memory_optimization": false,
        "synchronize_checkpoint_boundary": false
    },
    "aio": {
        "block_size": 1048576,
        "queue_depth": 8,
        "single_submit": false,
        "overlap_events": true,
        "thread_count": 1
    },
    "gradient_clipping": 1.0,
    "gradient_accumulation_steps": 1,
    "fp16": {
        "fp16": true,
        "enabled": true,
        "loss_scale": 0,
        "initial_scale_power": 12,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    }
}

@@ -15,70 +15,94 @@ torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
np.set_printoptions(precision=4, suppress=True, linewidth=200)

########################################################################################################
# Step 1: set model
#
# Set TOKEN_MODE to 'char' or 'bpe' if the model is trained by 'train.py' from scratch.
#
# Set TOKEN_MODE to 'pile' if you want to test pre-trained pile models.
########################################################################################################

TOKEN_MODE = 'char' # char / bpe / pile

n_layer = 6
n_embd = 512
ctx_len = 1024

if TOKEN_MODE == 'char':
    MODEL_NAME = 'trained-500' # your trained model
    WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
    # set UNKNOWN_CHAR to the rarest token in your vocab.json, and all unknown tokens in your prompt will be denoted by it
    UNKNOWN_CHAR = ' ' # here we just set it to ' ' for simplicity

elif TOKEN_MODE == 'bpe':
    MODEL_NAME = 'trained-500' # your trained model
    WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge] for your BPE model
    UNKNOWN_CHAR = None

elif TOKEN_MODE == 'pile':
    WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
    UNKNOWN_CHAR = None

    #---> you can set MODEL_NAME to your fine-tuned model <---

    MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
    # MODEL_NAME = 'trained-11'
    n_layer = 12
    n_embd = 768
    ctx_len = 1024

    # MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
    # n_layer = 24
    # n_embd = 1024
    # ctx_len = 1024

os.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment)
os.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'
model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'

########################################################################################################
# Step 2: set prompt & sampling stuffs
########################################################################################################

# context = 'A'
# context = "\nIn the"
# context = '\nSugar:'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'

NUM_TRIALS = 999
LENGTH_PER_TRIAL = 333

TEMPERATURE = 1.0
top_p = 0.7
top_p_newline = 0.9 # only used in TOKEN_MODE = char

DEBUG_DEBUG = False # True False --> show softmax output

########################################################################################################

print(f'Loading {MODEL_NAME}...')
from src.model_run import RWKV_RNN
model = RWKV_RNN(MODEL_NAME, os.environ['RWKV_RUN_DEVICE'], model_type, n_layer, n_embd, ctx_len)
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)

########################################################################################################

if tokenizer.charMode:
    context = tokenizer.refine_context(context)
    ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
else:
    ctx = tokenizer.tokenizer.encode(context)
src_len = len(ctx)
src_ctx = ctx.copy()

print('\nYour prompt has ' + str(src_len) + ' tokens.')
print('\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\n')

for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
    t_begin = time.time_ns()
    print(('-' * 30) + context, end='')
    ctx = src_ctx.copy()
    model.clear()
    if TRIAL == 0:
        init_state = types.SimpleNamespace()

@@ -104,6 +128,9 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
            print('model', np.array(x), '==>', np.array(
                out), np.max(out), np.min(out))
        if TOKEN_MODE == 'pile':
            out[0] = -999999999  # disable <|endoftext|>
        char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,
                                       top_p_usual=top_p, top_p_newline=top_p_newline)
        char = char.item()

@@ -112,5 +139,6 @@ for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
        else:
            print(tokenizer.tokenizer.decode(int(char)), end='', flush=True)
        ctx += [char]
    t_end = time.time_ns()
    print("\n----------", round((t_end - t_begin) / (10 ** 9), 2), end='s ')

@@ -26,74 +26,58 @@ from torch.utils.cpp_extension import load
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
                verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])

class WKV(torch.autograd.Function):
    @staticmethod
    def forward(ctx, B, T, C, w, u, k, v):
        ctx.B = B
        ctx.T = T
        ctx.C = C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            w = -torch.exp(w.float().contiguous())
            u = u.float().contiguous()
            k = k.float().contiguous()
            v = v.float().contiguous()
        else:
            w = -torch.exp(w.contiguous())
            u = u.contiguous()
            k = k.contiguous()
            v = v.contiguous()
        ctx.save_for_backward(w, u, k, v)
        y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
        wkv_cuda.forward(B, T, C, w, u, k, v, y)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return y.half()
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return y.bfloat16()
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return y

    @staticmethod
    def backward(ctx, gy):
        B = ctx.B
        T = ctx.T
        C = ctx.C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        w, u, k, v = ctx.saved_tensors
        gw = torch.zeros((B, C), device='cuda')
        gu = torch.zeros((B, C), device='cuda')
        gk = torch.zeros((B, T, C), device='cuda')
        gv = torch.zeros((B, T, C), device='cuda')
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
        else:
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
        gw = torch.sum(gw, dim=0)
        gu = torch.sum(gu, dim=0)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return (None, None, None, gw, gu, gk, gv)

def RUN_CUDA(B, T, C, w, u, k, v):
    return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())

@@ -376,6 +360,8 @@ class GPT(nn.Module):
                c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).half()
            elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
                c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).bfloat16()
            elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
                c = c @ F.one_hot(idx, num_classes=self.config.vocab_size)
            x = self.head(x) + c
        else:
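The unified WKV class above only changes how precisions are handled around the CUDA kernel; the recurrence itself is unchanged. As a rough mental model of what the kernel computes (and a way to sanity-check it on tiny shapes), each channel keeps an exponentially decayed weighted sum of past values keyed by k, with a learned bonus u for the current token. The pure-PyTorch sketch below is my naive reference under that reading; it skips the kernel's numerical-stability tricks and is far too slow for real use.

import torch

def wkv_reference(w, u, k, v):
    # w: (C,) negative decay (i.e. already -exp(w)), u: (C,) bonus, k/v: (B, T, C)
    B, T, C = k.size()
    y = torch.zeros_like(v)
    for b in range(B):
        num = torch.zeros(C, dtype=k.dtype, device=k.device)  # decayed sum of exp(k_i) * v_i
        den = torch.zeros(C, dtype=k.dtype, device=k.device)  # decayed sum of exp(k_i)
        for t in range(T):
            kt, vt = k[b, t], v[b, t]
            # the current token contributes with an extra bonus u on its key
            y[b, t] = (num + torch.exp(u + kt) * vt) / (den + torch.exp(u + kt))
            # decay the history by exp(w) (w < 0), then absorb the current token
            num = torch.exp(w) * num + torch.exp(kt) * vt
            den = torch.exp(w) * den + torch.exp(kt)
    return y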

@@ -26,74 +26,58 @@ if os.environ['RWKV_RUN_DEVICE'] == 'cuda':
wkv_cuda = load(name="wkv", sources=["cuda/wkv_op.cpp", "cuda/wkv_cuda.cu"],
                verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}'])

class WKV(torch.autograd.Function):
    @staticmethod
    def forward(ctx, B, T, C, w, u, k, v):
        ctx.B = B
        ctx.T = T
        ctx.C = C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            w = -torch.exp(w.float().contiguous())
            u = u.float().contiguous()
            k = k.float().contiguous()
            v = v.float().contiguous()
        else:
            w = -torch.exp(w.contiguous())
            u = u.contiguous()
            k = k.contiguous()
            v = v.contiguous()
        ctx.save_for_backward(w, u, k, v)
        y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)
        wkv_cuda.forward(B, T, C, w, u, k, v, y)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return y.half()
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return y.bfloat16()
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return y

    @staticmethod
    def backward(ctx, gy):
        B = ctx.B
        T = ctx.T
        C = ctx.C
        assert T <= T_MAX
        assert B * C % min(C, 1024) == 0
        w, u, k, v = ctx.saved_tensors
        gw = torch.zeros((B, C), device='cuda')
        gu = torch.zeros((B, C), device='cuda')
        gk = torch.zeros((B, T, C), device='cuda')
        gv = torch.zeros((B, T, C), device='cuda')
        if os.environ['RWKV_FLOAT_MODE'] != 'fp32':
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)
        else:
            wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)
        gw = torch.sum(gw, dim=0)
        gu = torch.sum(gu, dim=0)
        if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
            return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())
        elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
            return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())
        elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
            return (None, None, None, gw, gu, gk, gv)

def RUN_CUDA(B, T, C, w, u, k, v):
    return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())

@@ -19,6 +19,21 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

class L2Wrap(torch.autograd.Function):
    @staticmethod
    def forward(ctx, loss, y):
        ctx.save_for_backward(y)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        y = ctx.saved_tensors[0]
        # to encourage the logits to be close to 0
        factor = 1e-4 / (y.shape[0] * y.shape[1])
        maxx, ids = torch.max(y, -1, keepdim=True)
        gy = torch.zeros_like(y)
        gy.scatter_(-1, ids, maxx * factor)
        return (grad_output, gy)

class TrainerConfig:
    batch_size = 64
    learning_rate = 4e-4

@@ -109,14 +124,15 @@ class Trainer(LightningLite):
            for it, (x, y) in pbar:
                with torch.set_grad_enabled(is_train):
                    yyy, loss = model(x, y)  # forward the model
                    lossL2 = L2Wrap.apply(loss, yyy)

                all_loss = [loss.clone() for _ in range(NUM_GPUS)]
                torch.distributed.all_gather(all_loss, loss)

                if is_train:  # backprop and update the parameters
                    model.zero_grad()
                    self.backward(lossL2)
                    # deepspeed will handle gradient_clipping
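L2Wrap is the only change to the loss path: the forward pass returns the cross-entropy loss untouched, and the backward pass injects a tiny gradient that pushes each position's largest logit toward 0, which helps keep low-precision activations from drifting to large magnitudes. Under my reading (and assuming the usual incoming gradient of 1.0), this is equivalent to adding a small explicit penalty on the squared per-position maximum logit; the hypothetical helper below shows that formulation, the difference being that the penalty also shows up in the reported loss value.

import torch

def loss_with_logit_penalty(loss, logits, strength=1e-4):
    # logits: (B, T, vocab_size); penalize only each position's maximum logit,
    # scaled so its gradient matches L2Wrap's maxx * (strength / (B * T))
    B, T = logits.shape[0], logits.shape[1]
    maxx = logits.max(dim=-1).values
    return loss + 0.5 * (strength / (B * T)) * (maxx ** 2).sum()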

@@ -22,13 +22,19 @@ class Dataset(Dataset):
        self.data = data

        if 'MMapIndexedDataset' in str(type(self.data)):
            self.vocab_size = int(os.environ['VOCAB_SIZE'])
            print('current vocab size =', self.vocab_size, "(make sure it's correct)")
            self.data_size = len(self.data._bin_buffer) // 2
            print(f'data has {self.data_size} tokens.')
        elif 'numpy' in str(type(self.data)):
            self.vocab_size = int(os.environ['VOCAB_SIZE'])
            print('current vocab size =', self.vocab_size, "(make sure it's correct)")
            self.data_size = len(self.data)
            print(f'data has {self.data_size} tokens.')
        else:
            print('building token list...', end=' ')
            unique = sorted(list(set(data)))
            self.vocab_size = len(unique)
            # print()
            # for u in unique:
            #     print(u, end=' ')

@@ -41,25 +47,25 @@ class Dataset(Dataset):
                xx += 1
            with open('vocab.json', "w", encoding="utf-16") as vocab_file:
                vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
            self.data_size = len(self.data)
            print('data has %d tokens, %d unique.' % (self.data_size, self.vocab_size))
            self.stoi = {ch: i for i, ch in enumerate(unique)}
            self.itos = {i: ch for i, ch in enumerate(unique)}

    def __len__(self):
        return self.epoch_length_fixed // NUM_GPUS

    def __getitem__(self, idx):
        #
        # we are cheating: pick a random spot in dataset
        #
        i = np.random.randint(0, self.data_size - (self.ctx_len + 1))
        if 'MMapIndexedDataset' in str(type(self.data)):
            dix = self.data.get(idx=0, offset=i, length=self.ctx_len + 1).astype(int)
        elif 'numpy' in str(type(self.data)):
            dix = self.data[i:i+self.ctx_len+1]
        else:
            dix = [self.stoi[s] for s in self.data[i:i+self.ctx_len+1]]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)

@@ -70,8 +76,12 @@ class TOKENIZER():
    def __init__(self, WORD_NAME, UNKNOWN_CHAR='\ue083'):
        if 'list' in str(type(WORD_NAME)):
            self.charMode = False
            if WORD_NAME[0] == WORD_NAME[1]:
                from transformers import PreTrainedTokenizerFast
                self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])
            else:
                from transformers import GPT2TokenizerFast
                self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])
        else:
            self.charMode = True
            with open(WORD_NAME + '.json', "r", encoding="utf-16") as result_file:

@@ -85,15 +95,13 @@ class TOKENIZER():
            self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]

    def refine_context(self, context):
        context = context.strip().split('\n')
        for c in range(len(context)):
            context[c] = context[c].strip().strip('\u3000').strip('\r')
        context = list(filter(lambda c: c != '', context))
        context = '\n' + ('\n'.join(context)).strip()
        if context == '':
            context = '\n'
        return context

    def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):
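With this change, passing the same tokenizer file twice as WORD_NAME selects the HuggingFace PreTrainedTokenizerFast path used by the pile models, a [vocab, merges] pair still goes through GPT2TokenizerFast, and a plain string stays char mode. A small usage sketch, assuming 20B_tokenizer.json is present and transformers is installed:

from src.utils import TOKENIZER

# pile models: one tokenizer json, given twice
tok = TOKENIZER(['20B_tokenizer.json', '20B_tokenizer.json'], UNKNOWN_CHAR=None)
ids = tok.tokenizer.encode('\nIn a shocking finding, scientist discovered a herd of dragons')
print(len(ids), ids[:10])
print(tok.tokenizer.decode(ids))

# char-level models trained by train.py: pass the vocab name instead
# tok = TOKENIZER('vocab', UNKNOWN_CHAR=' ')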

@@ -3,39 +3,11 @@
########################################################################################################

import os

import logging, types
from src.utils import Dataset
import torch
import numpy as np
from src.binidx import MMapIndexedDataset

np.set_printoptions(precision=4, suppress=True, linewidth=200)
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",

@@ -44,35 +16,95 @@ torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# if False: # True False ---> Set to False if you don't understand it
#     print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n")
#     import src.utils
#     src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)

########################################################################################################
# Step 1: set training data & cfg
########################################################################################################

EXPRESS_PILE_MODE = False # True: express mode for fine-tuning a pile model // False: usual training

EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-169M'
# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'
# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-430M'

########################################################################################################

datafile = "../data/enwik8" # your data
datafile_encoding = 'utf-8' # 'utf-8' / 'utf-16le' / 'numpy' (for fine-tuning pile models) / 'binidx' (the Megatron-LM 'binidx' format)
# datafile = 'my-gpt_seq_document'
# datafile_encoding = 'binidx'

if EXPRESS_PILE_MODE:
    datafile = 'train.npy' # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy
    datafile_encoding = 'numpy'

#
# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch
# set VOCAB_SIZE = 50277 for fine-tuning pile models
# set VOCAB_SIZE = your_vocab_size for 'binidx' data
#
os.environ['VOCAB_SIZE'] = '0'
if EXPRESS_PILE_MODE:
    os.environ['VOCAB_SIZE'] = '50277'

#
# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
# 1) set RWKV_NUM_GPUS = '1' and let it run for 1 miniEpoch and it will save a trained-1.pth
# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = single_gpu_batchsz * RWKV_NUM_GPUS,
#    EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training from it
#
os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use

os.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future) or 'fp32'
os.environ['USE_WANDB'] = '0' # wandb logging. 0 = False, 1 = True

########################################################################################################
# Step 2: set model details
########################################################################################################

EPOCH_BEGIN = 0 # begins with miniEpoch = EPOCH_BEGIN
LOAD_MODEL = False # shall we load the #EPOCH_BEGIN model and continue the training from it?

ctx_len = 1024 # increase T_MAX in model.py if your ctx_len is very long
n_layer = 6
n_embd = 512

model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' (sometimes better)

# there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py
# set it to 256, then it's using my headQK trick (a tiny attention) to improve loss
# set it to 0, then it's a pure RNN (attention-free)

if EXPRESS_PILE_MODE:
    LOAD_MODEL = True
    if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
        n_layer = 12
        n_embd = 768
        ctx_len = 1024
    elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-430M':
        n_layer = 24
        n_embd = 1024
        ctx_len = 1024
    elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-1B5':
        n_layer = 24
        n_embd = 2048
        ctx_len = 1024

########################################################################################################
# Step 3: set batch size & learning rate etc.
########################################################################################################

# if you see "CUDA out of memory", reduce batch_size. Use nvidia-smi to find the highest value for your GPU.
batch_size = 12 * int(os.environ['RWKV_NUM_GPUS'])
assert (batch_size % int(os.environ['RWKV_NUM_GPUS']) == 0)

# By default we are using exponential LR decay.
# Here are my suggestions for training.
# Let's say you are training a L6-D512 model.

@@ -93,34 +125,51 @@ epoch_length_fixed = (10000 // batch_size) * batch_size # feel free to increase

# epoch_save_frequency 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...
epoch_save_frequency = 10
epoch_save_path = 'trained-'

if EXPRESS_PILE_MODE:
    if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':
        lr_init = 2e-5
    else:
        lr_init = 1e-5
    lr_final = 1e-5
    n_epoch = 100000

if LOAD_MODEL and EPOCH_BEGIN > 0: # we are not saving gradients, so let's have some warmup if we load a model
    warmup_tokens = 50 * ctx_len * batch_size // NUM_GPUS
else:
    warmup_tokens = 0

betas = (0.9, 0.99) # set betas = (0.9, 0.999) if your model has been trained for a while
eps = 1e-8

num_workers = 1 # DataLoader worker. I only tested num_workers = 1

NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
os.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)
MODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)

if EXPRESS_PILE_MODE:
    betas = (0.9, 0.999)
    MODEL_NAME = EXPRESS_PILE_MODEL_NAME

########################################################################################################
# Load data
########################################################################################################

print(f'loading {datafile_encoding} data... ' + datafile)
if datafile_encoding == 'binidx':
    train_dataset = Dataset(MMapIndexedDataset(datafile), ctx_len, epoch_length_fixed)
elif datafile_encoding == 'numpy':
    train_dataset = Dataset(np.load(datafile).astype('int'), ctx_len, epoch_length_fixed)
else:
    train_dataset = Dataset(open(datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)

########################################################################################################
# Train model
########################################################################################################

if __name__ == '__main__':
    from src.trainer import Trainer, TrainerConfig

@@ -180,12 +229,16 @@ if __name__ == '__main__':
            "min_loss_scale": 1
        }
        trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=16)
    elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
        DEEPSPEED_CFG["bf16"] = {
            "enabled": True
        }
        trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision='bf16')
    elif os.environ['RWKV_FLOAT_MODE'] == 'fp32':
        trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator="gpu", precision=32)

    print(trainer._strategy.config)
    trainer.run(m_cfg, train_dataset, None, tconf)
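The 'numpy' datafile_encoding above expects train.npy to already hold pile-tokenizer ids; the comments point to prepare-data.py in the RWKV-v2-RNN-Pile repo for that step. As a rough, hypothetical stand-in (assuming 20B_tokenizer.json and a plain-text corpus file of your own), the conversion is essentially:

import numpy as np
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='20B_tokenizer.json')

with open('my-corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read()

ids = tokenizer.encode(text)  # ids in the 50277-token pile vocab
np.save('train.npy', np.array(ids, dtype='uint16'))  # uint16 is enough for vocab < 65536
print(f'saved {len(ids)} tokens')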

@@ -17,15 +17,29 @@ import torch
from src.model_run import RWKV_RNN, RWKV_GPT
from src.model import GPT, GPTConfig

TOKEN_MODE = 'pile' # char / pile

if TOKEN_MODE == 'char':
    MODEL_NAME = 'trained-1'
    WORD_NAME = 'vocab' # the .json vocab (generated by train.py)
    ctx_len = 1024
    n_layer = 6
    n_embd = 512
    UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity
elif TOKEN_MODE == 'pile':
    WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']
    MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'
    ctx_len = 1024
    n_layer = 12
    n_embd = 768
    UNKNOWN_CHAR = None

model_type = 'RWKV'

from src.utils import TOKENIZER
tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)
if TOKEN_MODE == 'pile':
    tokenizer.vocab_size = 50277

########################################################################################################

@@ -36,17 +50,22 @@ if os.environ['RWKV_FLOAT_MODE'] == 'fp16':
elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':
    model_train = model_train.bfloat16()

print('loading ' + MODEL_NAME)
m2 = torch.load(MODEL_NAME + '.pth', map_location=RUN_DEVICE)
model_train.load_state_dict(m2)

model_rnn = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)
model_gpt = RWKV_GPT(MODEL_NAME, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()

########################################################################################################

# context = '\nIn a'
context = '\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'

if TOKEN_MODE == 'char':
    ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]
elif TOKEN_MODE == 'pile':
    ctx = tokenizer.tokenizer.encode(context)
print(f'input len {len(ctx)} data {ctx}')

########################################################################################################

@@ -67,5 +86,5 @@ for i in range(src_len):
        print('...')

print('\nRWKV-train output')
out = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().float().numpy()
print(out, '\n')
