img models (wip)

4 years ago · 40e91dd1d7
parent 3a7e6a6aa3
commit 40e91dd1d7
4 changed files with 247 additions and 29 deletions
--- a/RWKV-v4neo/src/dataset.py
+++ b/RWKV-v4neo/src/dataset.py
@ -2,7 +2,7 @@
 # The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
 ########################################################################################################
-import json, math
+import json, math, random
 import numpy as np
 import torch
 from torch.utils.data import Dataset
@ -37,6 +37,19 @@ class MyDataset(Dataset):
            print("Current vocab size =", self.vocab_size, "(make sure it's correct)")
            self.data_size = len(self.data)
            print(f"Data has {self.data_size} tokens.")
        elif args.data_type == "wds_img":
            def identity(x):
                return x            
            import torchvision as vision
            import webdataset as wds
            import torchvision.transforms as transforms
            img_transform = transforms.Compose(
                [transforms.CenterCrop(256)]
            )
            self.data = iter(wds.WebDataset(args.data_file, resampled=True).shuffle(1000).decode("torchrgb").to_tuple("jpg", "json", "txt").map_tuple(img_transform, identity, identity).with_epoch(1000000))
            print("WebDataset loaded.")
            self.vocab_size = -1
            self.data_size = -1
        else:
            if args.data_type == "dummy":
                print("Building dummy data...")
@ -71,15 +84,17 @@ class MyDataset(Dataset):
        return self.args.epoch_steps * self.args.micro_bsz
    def __getitem__(self, idx):
        #
        # we are cheating: pick a random spot in dataset
        #
        args = self.args
        rank = self.global_rank
        epoch = self.real_epoch
        world_size = self.world_size
        # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size}")
        if args.data_type == "wds_img":            
            dd = next(self.data) # jpg, json, txt
            # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size} {dd[2]}")
            return dd[0], dd[2]
        else:
            ctx_len = args.ctx_len
            req_len = ctx_len + 1
@ -91,6 +106,7 @@ class MyDataset(Dataset):
                i = i + args.my_pile_shift
                # print(f"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}")
            else:
                # cheat: pick a random spot in dataset
                i = np.random.randint(0, self.data_size - req_len)
            if args.data_type == "binidx":
--- a/RWKV-v4neo/src/model.py
+++ b/RWKV-v4neo/src/model.py
@ -3,7 +3,6 @@
 ########################################################################################################
 import os, math, gc
 from re import L
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
--- a/RWKV-v4neo/src/model_img.py
+++ b/RWKV-v4neo/src/model_img.py
@ -0,0 +1,195 @@
 ########################################################################################################
 # The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
 ########################################################################################################
 import os, math, gc
 import torchvision as vision
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
 import pytorch_lightning as pl
 from pytorch_lightning.utilities import rank_zero_info, rank_zero_only
 from pytorch_lightning.strategies import DeepSpeedStrategy
 import deepspeed
 from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
 from pytorch_msssim import SSIM
 class To2Bin(torch.autograd.Function):
    """Stochastically round a tensor to integers, passing gradients straight through.

    Forward adds uniform noise from [0, 1) to the input and floors the sum, so
    a value ``v`` in [0, 1] comes out as 1 with probability ``v`` and as 0
    otherwise. Backward returns the incoming gradient unchanged.
    """

    @staticmethod
    def forward(ctx, x):
        # Fresh noise with the same shape/dtype/device as x, sampled in [0, 1).
        noise = torch.empty_like(x).uniform_(0, 1)
        return torch.floor(x + noise)

    @staticmethod
    def backward(ctx, grad_output):
        # floor() has zero gradient almost everywhere, so the rounding is
        # treated as the identity for backprop.
        passthrough = grad_output.clone()
        return passthrough
 def __nop(ob):
    """Return ``ob`` unchanged; used as the no-op value of ``MyFunction`` when JIT is off."""
    return ob
 # Pick the module base class and the method decorator once, at import time.
 # RWKV_JIT_ON must be present in the environment (a missing key raises
 # KeyError); the value "1" selects the TorchScript variants.
 if os.environ["RWKV_JIT_ON"] == "1":
    MyModule = torch.jit.ScriptModule
    MyFunction = torch.jit.script_method
 else:
    MyModule = nn.Module
    MyFunction = __nop
 class RWKV_IMG(pl.LightningModule):
    def __init__(self, args):
        """Create the encoder, bottleneck and decoder layers plus the SSIM module.

        ``args`` is stored on the instance as-is; none of its fields are read here.
        Layers are registered in the order e0, e1, e2, ewww, dwww, d0, d1, d2,
        each stage holding four (BatchNorm2d, 3x3 Conv2d) pairs.
        """
        super().__init__()
        self.args = args

        def add_stage(prefix, channels):
            # Registers "<prefix>b0", "<prefix>w0", ..., "<prefix>b3", "<prefix>w3"
            # in that order; every conv keeps the channel count and spatial size.
            for idx in range(4):
                setattr(self, f"{prefix}b{idx}", nn.BatchNorm2d(channels))
                setattr(self, f"{prefix}w{idx}", nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1))

        # Encoder stages (12 -> 48 -> 192 channels).
        add_stage("e0", 12)
        add_stage("e1", 48)
        add_stage("e2", 192)

        # Bottleneck: 192 channels down to an 8-channel code and back up.
        self.ewww = nn.Conv2d(192, 8, kernel_size=3, stride=1, padding=1)
        self.dwww = nn.Conv2d(8, 192, kernel_size=3, stride=1, padding=1)

        # Decoder stages (192 -> 48 -> 12 channels).
        add_stage("d0", 192)
        add_stage("d1", 48)
        add_stage("d2", 12)

        self.SSIM = SSIM(data_range=1, size_average=True, channel=3)
    def configure_optimizers(self):
        """Return an Adam optimizer over every parameter, with weight decay disabled.

        DeepSpeedCPUAdam is returned when ``self.deepspeed_offload`` is truthy,
        FusedAdam otherwise. Learning rate, betas and eps come from ``self.args``.
        """
        hparams = self.args
        # One parameter group containing all parameters.
        optim_groups = [
            {"params": [param for _, param in self.named_parameters()], "weight_decay": 0.0},
        ]
        shared = dict(
            lr=hparams.lr_init,
            betas=hparams.betas,
            eps=hparams.adam_eps,
            bias_correction=True,
            weight_decay=0,
            amsgrad=False,
        )
        if self.deepspeed_offload:
            # Keyword spelling differs between the two optimizers: adamw_mode vs adam_w_mode.
            return DeepSpeedCPUAdam(optim_groups, adamw_mode=False, **shared)
        return FusedAdam(optim_groups, adam_w_mode=False, **shared)
        # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)
    @property
    def deepspeed_offload(self) -> bool:
        """Whether the trainer's DeepSpeed strategy offloads optimizer state or parameters.

        Returns:
            True if the strategy is a DeepSpeedStrategy whose
            ``zero_optimization`` config has a truthy ``offload_optimizer`` or
            ``offload_param`` entry; False otherwise, including when the
            strategy is not a DeepSpeedStrategy.
        """
        strategy = self.trainer.strategy
        if not isinstance(strategy, DeepSpeedStrategy):
            return False
        zero_cfg = strategy.config["zero_optimization"]
        # .get() yields None or whatever value the config holds for the key;
        # coerce so the declared bool return type actually holds.
        return bool(zero_cfg.get("offload_optimizer") or zero_cfg.get("offload_param"))
    def forward(self, img):
        """Encode ``img`` into a stochastically binarised code and decode it again.

        The encoder applies three pixel_unshuffle(2) steps, each followed by
        two residual blocks, then ``ewww`` produces 8 channels that are squashed
        with a sigmoid and rounded by ``To2Bin``. The decoder mirrors this with
        ``dwww``, residual blocks and three pixel_shuffle(2) steps, ending in a
        sigmoid. The first stage's 12-channel layers imply a 3-channel input.
        """

        def residual(t, norm_a, conv_a, norm_b, conv_b):
            # t + conv_b(mish(norm_b(conv_a(mish(norm_a(t))))))
            inner = conv_a(F.mish(norm_a(t)))
            return t + conv_b(F.mish(norm_b(inner)))

        # ---- encoder ----
        x = F.pixel_unshuffle(img, 2)
        x = residual(x, self.e0b0, self.e0w0, self.e0b1, self.e0w1)
        x = residual(x, self.e0b2, self.e0w2, self.e0b3, self.e0w3)

        x = F.pixel_unshuffle(x, 2)
        x = residual(x, self.e1b0, self.e1w0, self.e1b1, self.e1w1)
        x = residual(x, self.e1b2, self.e1w2, self.e1b3, self.e1w3)

        x = F.pixel_unshuffle(x, 2)
        x = residual(x, self.e2b0, self.e2w0, self.e2b1, self.e2w1)
        x = residual(x, self.e2b2, self.e2w2, self.e2b3, self.e2w3)

        # ---- bottleneck ----
        code = torch.sigmoid(self.ewww(x))
        code = To2Bin.apply(code)
        x = self.dwww(code)

        # ---- decoder ----
        x = residual(x, self.d0b0, self.d0w0, self.d0b1, self.d0w1)
        x = residual(x, self.d0b2, self.d0w2, self.d0b3, self.d0w3)

        x = F.pixel_shuffle(x, 2)
        x = residual(x, self.d1b0, self.d1w0, self.d1b1, self.d1w1)
        x = residual(x, self.d1b2, self.d1w2, self.d1b3, self.d1w3)

        x = F.pixel_shuffle(x, 2)
        x = residual(x, self.d2b0, self.d2w0, self.d2b1, self.d2w1)
        x = residual(x, self.d2b2, self.d2w2, self.d2b3, self.d2w3)

        x = F.pixel_shuffle(x, 2)
        return torch.sigmoid(x)
    def training_step(self, batch, batch_idx):
        """Reconstruct the batch images and return ``1 - SSIM(out, img)`` as the loss.

        On the global-zero rank, whenever ``global_step + 1`` is a multiple of
        ``100 * int(args.devices)``, the first four source and reconstructed
        images are written under ``test/image_model/``.
        """
        img, _ = batch  # the second batch element is not used in this step
        out = self(img)

        trainer = self.trainer
        # Short-circuit keeps the step arithmetic on the global-zero rank only.
        if trainer.is_global_zero and (trainer.global_step + 1) % (100 * int(self.args.devices)) == 0:
            vision.utils.save_image(img[:4], f"test/image_model/{self.trainer.global_step}-src.jpg")
            vision.utils.save_image(out[:4], f"test/image_model/{self.trainer.global_step}-out.jpg")

        similarity = self.SSIM(out.float(), img.float())
        return 1 - similarity
    def training_step_end(self, batch_parts):
        """Gather the step output across processes and stash it on the trainer.

        The gathered value is stored as ``trainer.my_loss_all`` on the
        global-zero rank only. Nothing is returned.
        """
        # all_gather is called unconditionally, before the rank check.
        gathered = self.all_gather(batch_parts)
        if not self.trainer.is_global_zero:
            return
        self.trainer.my_loss_all = gathered
    def generate_init_weight(self):
        print(
            f"""
 ############################################################################
 #
 # Init model weight (slow for large models)...
 #
 ############################################################################
 """
        )
        m = {}
        for n in self.state_dict():
            p = self.state_dict()[n]
            shape = p.shape
            m[n] = p
            m[n] = m[n].cpu()
            if os.environ["RWKV_FLOAT_MODE"] == "fp16":
                m[n] = m[n].half()
            elif os.environ["RWKV_FLOAT_MODE"] == "bf16":
                m[n] = m[n].bfloat16()
        gc.collect()
        torch.cuda.empty_cache()
        return m
--- a/RWKV-v4neo/train.py
+++ b/RWKV-v4neo/train.py
@ -158,7 +158,7 @@ if __name__ == "__main__":
                if args.my_pile_stage == 2:
                    args.warmup_steps = 10
                else:
-                    args.warmup_steps = 50
+                    args.warmup_steps = 30
            args.epoch_begin = max_p + 1
    samples_per_epoch = args.epoch_steps * args.real_bsz
@ -188,7 +188,7 @@ if __name__ == "__main__":
    )
    rank_zero_info(str(vars(args)) + "\n")
-    assert args.data_type in ["utf-8", "utf-16le", "numpy", "binidx", "dummy"]
+    assert args.data_type in ["utf-8", "utf-16le", "numpy", "binidx", "dummy", "wds_img"]
    if args.lr_final == 0 or args.lr_init == 0:
        rank_zero_info("\n\nNote: lr_final = 0 or lr_init = 0. Using linear LR schedule instead.\n\n")
@ -223,11 +223,15 @@ if __name__ == "__main__":
    from src.trainer import train_callback, generate_init_weight
    from src.dataset import MyDataset
    from src.model import RWKV
    train_data = MyDataset(args)
    args.vocab_size = train_data.vocab_size
    if args.data_type == 'wds_img':
        from src.model_img import RWKV_IMG
        model = RWKV_IMG(args)
    else:
        from src.model import RWKV
        model = RWKV(args)
    if len(args.load_model) == 0 or args.my_pile_stage == 1:  # shall we build the initial weights?
@ -250,6 +254,10 @@ if __name__ == "__main__":
            print(f"Trying {args.load_model}")
            load_dict = torch.load(args.load_model, map_location="cpu")
    # load_keys = load_dict.keys()
    # for k in model.state_dict():
    #     if k not in load_keys:
    #         load_dict[k] = model.state_dict()[k]
    model.load_state_dict(load_dict)
    trainer = Trainer.from_argparse_args(