""" 2024-11-10 15:29:50 """ import torchvision.transforms as transforms import sys import os import torch from torch.autograd import Variable import torch.nn as nn import torch.backends.cudnn as cudnn import torch.optim as optim from datetime import datetime import multiprocessing from transformers import ViTModel, ViTConfig from sklearn.metrics import f1_score from sklearn.model_selection import KFold import numpy as np from collections import Counter from torch.optim.lr_scheduler import StepLR from PIL import Image import torch.nn.functional as F class PatchEmbedding(nn.Module): def __init__(self, img_size, patch_size, in_channels, embed_dim, hidden_dim): super(PatchEmbedding, self).__init__() # self.patch_embed = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) self.patch_embed = nn.Conv2d(in_channels, hidden_dim, kernel_size=patch_size, stride=patch_size) self.num_patches = (img_size // patch_size) ** 2 def forward(self, x): x = self.patch_embed(x).flatten(2).transpose(1, 2) # (batch_size, num_patches, embed_dim) return x class PositionalEncoding(nn.Module): def __init__(self, num_patches, embed_dim, hidden_dim): super(PositionalEncoding, self).__init__() self.positional_encoding = nn.Parameter(torch.randn(1, num_patches, hidden_dim)) def forward(self, x): return x + self.positional_encoding class TransformerLayer(nn.Module): def __init__(self, hidden_dim, num_heads, mlp_dim, dropout_rate): super(TransformerLayer, self).__init__() self.attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout_rate) self.mlp = nn.Sequential( nn.Linear(hidden_dim, mlp_dim), nn.GELU(), nn.Dropout(dropout_rate), nn.Linear(mlp_dim, hidden_dim), nn.Dropout(dropout_rate) ) self.norm1 = nn.LayerNorm(hidden_dim) self.norm2 = nn.LayerNorm(hidden_dim) def forward(self, x): attn_out, _ = self.attention(x, x, x) x = self.norm1(x + attn_out) x = self.norm2(x + self.mlp(x)) return x # EvoViTModel class for building Vision Transformer model class EvoViTModel(nn.Module): def __init__(self, img_size, patch_size, in_channels, embed_dim, num_classes, hidden_dim): super(EvoViTModel, self).__init__() self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim, hidden_dim) self.position_encoding = PositionalEncoding(self.patch_embed.num_patches, embed_dim, hidden_dim) self.sigmoid = nn.Sigmoid() # Placeholder for dynamically generated init: # Transformer Layer Initialization self.transformer_layer_0 = TransformerLayer(num_heads=8, mlp_dim=2048, hidden_dim=512, dropout_rate=0.20362387412323335) self.transformer_layer_1 = TransformerLayer(num_heads=8, mlp_dim=3072, hidden_dim=512, dropout_rate=0.29859399476669696) self.transformer_layer_2 = TransformerLayer(num_heads=16, mlp_dim=4096, hidden_dim=512, dropout_rate=0.24029622136332746) self.transformer_layer_3 = TransformerLayer(num_heads=8, mlp_dim=2048, hidden_dim=512, dropout_rate=0.22640265738407994) self.transformer_layer_4 = TransformerLayer(num_heads=16, mlp_dim=3072, hidden_dim=512, dropout_rate=0.2969787366320388) self.transformer_layer_5 = TransformerLayer(num_heads=16, mlp_dim=2048, hidden_dim=512, dropout_rate=0.11264741089870321) self.transformer_layer_6 = TransformerLayer(num_heads=8, mlp_dim=4096, hidden_dim=512, dropout_rate=0.25324312813345734) self.transformer_layer_7 = TransformerLayer(num_heads=8, mlp_dim=2048, hidden_dim=512, dropout_rate=0.17729069086242882) self.transformer_layer_8 = TransformerLayer(num_heads=8, mlp_dim=2048, hidden_dim=512, dropout_rate=0.2531553780827078) self.transformer_layer_9 = TransformerLayer(num_heads=16, mlp_dim=2048, hidden_dim=512, dropout_rate=0.17372554665581236) self.transformer_layer_10 = TransformerLayer(num_heads=16, mlp_dim=3072, hidden_dim=512, dropout_rate=0.25217233180956183) self.transformer_layer_11 = TransformerLayer(num_heads=8, mlp_dim=4096, hidden_dim=512, dropout_rate=0.24459590331387862) self.transformer_layer_12 = TransformerLayer(num_heads=8, mlp_dim=2048, hidden_dim=512, dropout_rate=0.17589263405869232) self.classifier = nn.Linear(512, 48) def forward(self, x): expected_dtype = self.patch_embed.patch_embed .weight.dtype if x.dtype != expected_dtype: x = x.to(expected_dtype) x = self.patch_embed(x) x = self.position_encoding(x) # Pass through additional transformer layers # Placeholder for dynamically generated forward pass: x = self.transformer_layer_0(x) x = self.transformer_layer_1(x) x = self.transformer_layer_2(x) x = self.transformer_layer_3(x) x = self.transformer_layer_4(x) x = self.transformer_layer_5(x) x = self.transformer_layer_6(x) x = self.transformer_layer_7(x) x = self.transformer_layer_8(x) x = self.transformer_layer_9(x) x = self.transformer_layer_10(x) x = self.transformer_layer_11(x) x = self.transformer_layer_12(x) x = self.classifier(x[:, 0]) #probs = self.sigmoid(x) #return probs return x