"""
Implementation of YOLOv3 architecture
"""

import torch
import torch.nn as nn
# import config as cfg
""" 
Information about the architecture config:
A tuple is structured as (filters, kernel_size, stride) and describes one conv block.
Every conv is padded like a "same" convolution (padding=1 for 3x3 kernels, 0 for 1x1).
A list is structured as ["B", num_repeats]: a residual block followed by its number of repeats.
"S" is a scale prediction block; its output is what the YOLO loss is computed on.
"U" upsamples the feature map and concatenates it with an earlier backbone feature map.
"""
# Layer-by-layer description of the network, consumed in order by
# YOLOv3._create_conv_layers:
#   (filters, kernel_size, stride) -> one CNNBlock
#   ["B", n]                       -> one ResidualBlock with n repeats
#   "S"                            -> ResidualBlock (no skip) + 1x1 CNNBlock + ScalePrediction
#   "U"                            -> nn.Upsample(scale_factor=2); YOLOv3.forward then
#                                     concatenates a stored ["B", 8] output onto it
config = [
    (32, 3, 1),
    (64, 3, 2),  # stride 2: first of five downsampling convs
    ["B", 1],
    (128, 3, 2),
    ["B", 2],
    (256, 3, 2),
    ["B", 8],  # output is kept by YOLOv3.forward for the second "U"
    (512, 3, 2),
    ["B", 8],  # output is kept by YOLOv3.forward for the first "U"
    (1024, 3, 2),
    ["B", 4],  # Everything up to this point is the Darknet-53 backbone
    (512, 1, 1),
    (1024, 3, 1),
    "S",  # prediction at 1/32 of the input resolution
    (256, 1, 1),
    "U",
    (256, 1, 1),
    (512, 3, 1),
    "S",  # prediction at 1/16 of the input resolution
    (128, 1, 1),
    "U",
    (128, 1, 1),
    (256, 3, 1),
    "S",  # prediction at 1/8 of the input resolution
]


class CNNBlock(nn.Module):
    """2D convolution, optionally followed by BatchNorm2d and LeakyReLU(0.1).

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of output channels of the convolution.
        bn_act: when True the block computes ``leaky(bn(conv(x)))`` and the
            convolution has no bias; when False it returns the raw
            convolution output and the convolution has a bias.
        **kwargs: forwarded unchanged to ``nn.Conv2d`` (e.g. ``kernel_size``,
            ``stride``, ``padding``).
    """

    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.use_bn_act = bn_act
        # The convolution only carries a bias when no batch norm follows it.
        conv_has_bias = not bn_act
        self.conv = nn.Conv2d(
            in_channels, out_channels, bias=conv_has_bias, **kwargs
        )
        # bn and leaky are always constructed; forward() skips them when
        # use_bn_act is False.
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply the convolution and, if enabled, batch norm + LeakyReLU."""
        out = self.conv(x)
        if not self.use_bn_act:
            return out
        out = self.bn(out)
        return self.leaky(out)


class ResidualBlock(nn.Module):
    """Stack of ``num_repeats`` (1x1 conv -> 3x3 conv) pairs.

    Each pair halves the channel count with a 1x1 CNNBlock and restores it
    with a 3x3 CNNBlock, so the input and output channel counts match.

    Args:
        channels: number of input (and output) channels.
        use_residual: when True each pair's output is added to its input;
            when False the pairs are simply chained.
        num_repeats: how many pairs to stack.
    """

    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        self.use_residual = use_residual
        self.num_repeats = num_repeats

        squeezed = channels // 2
        self.layers = nn.ModuleList(
            [
                nn.Sequential(
                    CNNBlock(channels, squeezed, kernel_size=1),
                    CNNBlock(squeezed, channels, kernel_size=3, padding=1),
                )
                for _ in range(num_repeats)
            ]
        )

    def forward(self, x):
        """Run every pair in order, with or without the skip connection."""
        if self.use_residual:
            for pair in self.layers:
                x = x + pair(x)
            return x

        for pair in self.layers:
            x = pair(x)
        return x


class ScalePrediction(nn.Module):
    """Prediction head for one scale: 3x3 CNNBlock followed by a plain 1x1 conv.

    The final convolution emits ``3 * (num_classes + 5)`` channels, which
    ``forward`` splits into 3 groups of ``num_classes + 5`` values each.

    Args:
        in_channels: number of channels of the incoming feature map.
        num_classes: number of object classes predicted per group.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.num_classes = num_classes

        widened = 2 * in_channels
        out_channels = (num_classes + 5) * 3
        self.pred = nn.Sequential(
            CNNBlock(in_channels, widened, kernel_size=3, padding=1),
            # No batch norm / activation on the final layer: raw outputs.
            CNNBlock(widened, out_channels, bn_act=False, kernel_size=1),
        )

    def forward(self, x):
        """Return predictions shaped (batch, 3, height, width, per-group values)."""
        batch = x.shape[0]
        height = x.shape[2]
        width = x.shape[3]

        raw = self.pred(x)
        # Split the channel axis into (3, -1), then move the per-group
        # values to the last axis.
        grouped = raw.reshape(batch, 3, -1, height, width)
        return grouped.permute(0, 1, 3, 4, 2)


class YOLOv3(nn.Module):
    """YOLOv3 network assembled from the module-level ``config`` list.

    ``forward`` returns one prediction tensor per ``"S"`` entry in ``config``
    and, as a side effect, records the outputs of the residual stages with
    8 and 4 repeats in ``self.features`` (see ``get_features``).

    Args:
        in_channels: number of channels of the input image.
        num_classes: number of object classes each prediction head outputs.
    """

    def __init__(self, in_channels=3, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()
        self.base_model = None
        # self.distill_feature = cfg.DISTILL
        # self.warp = cfg.WARP
        self.feature_store = None
        self.enable_warp_train = False
        # Created here (not only in forward) so get_features() is safe to
        # call before the first forward pass.
        self.features = []

    def get_features(self):
        """Return the feature maps recorded by the most recent ``forward``.

        The list is empty until ``forward`` has been called at least once.
        """
        return self.features

    def adaptation(self, layer_id, num_class, in_feature, old_class):
        """Widen one prediction head to ``num_class`` classes, keeping old weights.

        The final 1x1 conv of ``self.layers[layer_id]`` is replaced by a new
        one with ``3 * (5 + num_class)`` output channels. For each of the 3
        channel groups, the first ``5 + old_class`` rows of weight and bias
        are copied from the old head; the rows for the added classes keep
        their fresh initialisation.

        With the default ``config`` the heads sit at layer indices 15, 22 and
        29, with ``in_feature`` 1024, 512 and 256 respectively.

        Args:
            layer_id: index into ``self.layers`` of the ScalePrediction to adapt.
            num_class: number of classes the new head predicts.
            in_feature: input channels of the head's final 1x1 convolution.
            old_class: number of classes the current head predicts.

        Raises:
            ValueError: if ``num_class < old_class`` or the current head does
                not have ``3 * (5 + old_class)`` output channels.
        """
        num_groups = 3  # matches the fixed 3 in ScalePrediction.forward's reshape
        old_width = 5 + old_class
        new_width = 5 + num_class
        if num_class < old_class:
            raise ValueError(
                f"num_class ({num_class}) must be >= old_class ({old_class})"
            )

        head = self.layers[layer_id].pred
        with torch.no_grad():
            old_conv = head[1].conv
            old_weight = old_conv.weight
            old_bias = old_conv.bias
            if old_weight.shape[0] != num_groups * old_width:
                raise ValueError(
                    f"layer {layer_id} has {old_weight.shape[0]} output channels, "
                    f"expected {num_groups * old_width} for old_class={old_class}"
                )

            # Build the replacement on the same device/dtype as the head it
            # replaces; otherwise a model already moved to GPU (or cast to
            # another dtype) would end up with a mismatched layer.
            new_block = CNNBlock(
                in_feature, new_width * num_groups, bn_act=False, kernel_size=1
            ).to(device=old_weight.device, dtype=old_weight.dtype)

            for group in range(num_groups):
                src = slice(group * old_width, (group + 1) * old_width)
                dst = slice(group * new_width, group * new_width + old_width)
                new_block.conv.weight[dst] = old_weight[src]
                new_block.conv.bias[dst] = old_bias[src]

            # Install the new head only after every copy succeeded, so a
            # failure above leaves the model untouched.
            head[1] = new_block

    def forward(self, x):
        """Run the network and return the list of per-scale predictions.

        Each element is the output of one ScalePrediction layer, in the
        order the ``"S"`` entries appear in ``config``. ``self.features`` is
        reset and refilled on every call.
        """
        outputs = []  # one entry per ScalePrediction layer
        route_connections = []  # stage outputs awaiting concatenation
        self.features = []
        for layer in self.layers:
            if isinstance(layer, ScalePrediction):
                # Prediction heads branch off; x continues unchanged.
                outputs.append(layer(x))
                continue

            x = layer(x)

            if isinstance(layer, ResidualBlock):
                if layer.num_repeats == 8:
                    self.features.append(x)
                    route_connections.append(x)
                elif layer.num_repeats == 4:
                    self.features.append(x)
            elif isinstance(layer, nn.Upsample):
                # Concatenate with the most recently stored stage output
                # along the channel axis, consuming it.
                x = torch.cat([x, route_connections.pop()], dim=1)

        return outputs

    def _create_conv_layers(self):
        """Translate ``config`` into an ``nn.ModuleList``, tracking channel counts."""
        layers = nn.ModuleList()
        in_channels = self.in_channels

        for module in config:
            if isinstance(module, tuple):
                out_channels, kernel_size, stride = module
                layers.append(
                    CNNBlock(
                        in_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=1 if kernel_size == 3 else 0,
                    )
                )
                in_channels = out_channels

            elif isinstance(module, list):
                num_repeats = module[1]
                layers.append(ResidualBlock(in_channels, num_repeats=num_repeats))

            elif isinstance(module, str):
                if module == "S":
                    layers.extend(
                        [
                            ResidualBlock(in_channels, use_residual=False, num_repeats=1),
                            CNNBlock(in_channels, in_channels // 2, kernel_size=1),
                            ScalePrediction(in_channels // 2, num_classes=self.num_classes),
                        ]
                    )
                    # The main path continues from the 1x1 CNNBlock output.
                    in_channels = in_channels // 2

                elif module == "U":
                    layers.append(nn.Upsample(scale_factor=2))
                    # forward() concatenates a stored stage output that, in
                    # this config, has twice the channels of the upsampled map.
                    in_channels = in_channels * 3

        return layers


# if __name__ == "__main__":
#     num_classes = 19
#     IMAGE_SIZE = 416
#     model = YOLOv3(num_classes=num_classes)
#     # print(model)
#     print(model.layers[15].pred[1].conv.weight.shape)
#     print(model.layers[15].pred[1].conv.bias.shape)
#     import torch.optim as optim
#     optimizer = optim.Adam(
#         model.parameters(), lr=cfg.LEARNING_RATE, weight_decay=cfg.WEIGHT_DECAY
#     )
#     from utils import load_checkpoint
#     load_checkpoint(
#             cfg.BASE_CHECK_POINT, model, optimizer, cfg.LEARNING_RATE
#     )

#     model.adaptation(layer_id = 15, num_class = 20, in_feature = 1024, old_class = num_classes)
#     model.adaptation(layer_id = 22, num_class = 20, in_feature = 512, old_class = num_classes)
#     model.adaptation(layer_id = 29, num_class = 20, in_feature = 256, old_class = num_classes) 
#     # layer1 = 
#     # model.eval()
#     # with torch.no_grad():
#     #     old_weight = model.layers[15].pred[1].conv.weight
#     #     old_bias = model.layers[15].pred[1].conv.bias
#     #     # print(model.layers[22].pred[1])
#     #     # print(model.layers[29].pred[1])
#     #     # out_dims = cfg.BASE_CLASS + cfg.NEW_CLASS + 5
#     #     model.layers[15].pred[1] = CNNBlock(1024, 25 * 3, bn_act=False, kernel_size=1)
#     #     model.layers[15].pred[1].conv.weight[:72] = old_weight
#     #     model.layers[15].pred[1].conv.bias[:72] = old_bias
#     print(model.layers[15].pred[1].conv.weight.shape)
#     # model.layers[22].pred[1] = CNNBlock(512, out_dims * 3,  kernel_size=1)
#     # model.layers[29].pred[1] = CNNBlock(256, out_dims * 3,  kernel_size=1)
#     x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
#     out = model(x)
#     # assert model(x)[0].shape == (2, 3, IMAGE_SIZE//32, IMAGE_SIZE//32, num_classes + 5)
#     # assert model(x)[1].shape == (2, 3, IMAGE_SIZE//16, IMAGE_SIZE//16, num_classes + 5)
#     # assert model(x)[2].shape == (2, 3, IMAGE_SIZE//8, IMAGE_SIZE//8, num_classes + 5)
#     print("Success!")