diff --git a/RWKV-v4/src/model.py b/RWKV-v4/src/model.py
index 6151b65..40c279c 100644
--- a/RWKV-v4/src/model.py
+++ b/RWKV-v4/src/model.py
@@ -68,8 +68,8 @@ def RUN_CUDA(B, T, C, w, u, k, v):
 ########################################################################################################
 
 def RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module
-    print('\n[--> first run, init model params (very slow for large models) <--]\n')
-    print('\n[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
+    print('\n[--> first run, init model params (very slow for large models) <--]')
+    print('[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\n')
     for m in module.modules():
         if not isinstance(m, (nn.Linear, nn.Embedding)):
             continue
diff --git a/RWKV-v4/train.py b/RWKV-v4/train.py
index 865a715..4c670d3 100644
--- a/RWKV-v4/train.py
+++ b/RWKV-v4/train.py
@@ -7,7 +7,12 @@ import os
 os.environ['USE_WANDB'] = '0' # 0 = False, 1 = True
 
 ### This is using DeepSpeed stage2 + FP16 ##############################################################
-
+#
+# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:
+# 1) leave RWKV_NUM_GPUS = '1' and let it run for 1 'mini-epoch' and it will save a 'trained-1.pth'
+# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = NUM_GPUS * single_gpu_batchsz,
+#    EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training
+#
 os.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use
 NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])
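For reference, step 2 of the procedure in the comment above amounts to roughly the following settings in train.py. This is a sketch only: RWKV_NUM_GPUS, EPOCH_BEGIN, LOAD_MODEL and batch_size are the knobs named in the comment, while 8 GPUs and a per-GPU batch size of 12 are assumed example values.

import os

os.environ['RWKV_NUM_GPUS'] = '8'           # step 2: switch from 1 GPU to all available GPUs (assumed: 8)
NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])

single_gpu_batchsz = 12                     # assumed: whatever per-GPU batch size was used in step 1
batch_size = NUM_GPUS * single_gpu_batchsz  # total batch size scales with the GPU count

EPOCH_BEGIN = 1                             # resume numbering after the single-GPU 'mini-epoch'
LOAD_MODEL = True                           # load 'trained-1.pth' saved by step 1 and continue training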