fix added
app.py CHANGED
@@ -115,27 +115,6 @@ class Blip2QFormer(nn.Module):
 
         return outputs.last_hidden_state
 
-class LayerNorm(nn.LayerNorm):
-    """Subclass torch's LayerNorm to handle fp16."""
-
-    def forward(self, x: torch.Tensor):
-        orig_type = x.dtype
-        ret = super().forward(x.type(torch.float32))
-        return ret.type(orig_type)
-
-
-class ViTClassifier(nn.Module):
-    def __init__(self, vit, ln_vision, num_labels):
-        super(ViTClassifier, self).__init__()
-        self.vit = vit  # Pretrained ViT from MiniGPT-4
-        self.ln_vision = ln_vision  # LayerNorm from MiniGPT-4
-        self.classifier = nn.Linear(vit.num_features, num_labels)
-
-    def forward(self, x):
-        features = self.ln_vision(self.vit(x))  # [batch, seq_len, dim]
-        cls_token = features[:, 0, :]  # Extract CLS token
-        return self.classifier(cls_token)
-
 
 class SkinGPT4(nn.Module):
     def __init__(self, vit_checkpoint_path,
@@ -161,10 +140,7 @@ class SkinGPT4(nn.Module):
         self.q_former.load_from_pretrained(q_former_model)
         for param in self.q_former.parameters():
             param.requires_grad = False
-
-        for param in module.parameters():
-            param.requires_grad = False
-        module.eval()
+
         print("Loaded QFormer")
 
         self.tokenizer = LlamaTokenizer.from_pretrained(
@@ -185,8 +161,10 @@ class SkinGPT4(nn.Module):
         print(f"Q-Former output dim: {self.q_former.bert_config.hidden_size}")
         print(f"LLaMA input dim: {self.llama.config.hidden_size}")
 
-        for
-            param
+        for module in [self.vit, self.ln_vision, self.q_former, self.llama_proj, self.llama]:
+            for param in module.parameters():
+                param.requires_grad = False
+            module.eval()
 
     def _init_vit(self, vit_checkpoint_path):
         """Initialize EVA-ViT-G with paper specifications"""
@@ -213,9 +191,6 @@ class SkinGPT4(nn.Module):
         # 4. Load weights while ignoring classifier head
         vit.load_state_dict(vit_weights, strict=False)
 
-        # 5. Freeze according to paper specs
-        for param in vit.parameters():
-            param.requires_grad = False
 
         return vit.eval()
 
@@ -226,27 +201,13 @@ class SkinGPT4(nn.Module):
             "": 0 if torch.cuda.is_available() else "cpu"
         }
         # First try loading with device_map="auto"
-
-
-
-
-
-
-
-            )
-        except ImportError:
-            # Fallback to CPU-offloading if GPU memory is insufficient
-            with init_empty_weights():
-                model = LlamaForCausalLM.from_pretrained(
-                    "meta-llama/Llama-2-13b-chat-hf",
-                    token=token,
-                    torch_dtype=torch.float16
-                )
-            model = model.to(self.device)
-
-            # Freeze all parameters
-            for param in model.parameters():
-                param.requires_grad = False
+        model = LlamaForCausalLM.from_pretrained(
+            "meta-llama/Llama-2-13b-chat-hf",
+            token=token,
+            torch_dtype=torch.float16,
+            device_map=device_map,
+            low_cpu_mem_usage=True
+        )
 
         return model.eval()
 
@@ -259,12 +220,7 @@ class SkinGPT4(nn.Module):
                 f"Original error: {str(e)}"
             )
 
-    def
-        """Paper specifies Xavier initialization for alignment layer"""
-        nn.init.xavier_normal_(self.llama_proj.weight)
-        nn.init.constant_(self.llama_proj.bias, 0)
-
-    def _create_patches(self, x):
+    def encode_image(self, x):
         """Convert image to patch embeddings following Eq. (1)"""
         # x: (B, C, H, W)
         x = x.to(self.dtype)
@@ -276,69 +232,39 @@ class SkinGPT4(nn.Module):
         B, C, H, W = x.shape
         N = (H * W) // (self.P ** 2)
 
-        x = self.vit.patch_embed(x)
+        x = self.vit.patch_embed(x)
 
         num_patches = x.shape[1]
-        pos_embed = self.vit.pos_embed[:, 1:num_patches + 1, :]
+        pos_embed = self.vit.pos_embed[:, 1:num_patches + 1, :]
         x = x + pos_embed
 
         # Add class token
-        class_token = self.vit.cls_token.expand(
-        x = torch.cat([class_token, x], dim=1)
-        return x
-
-    def forward_encoder(self, x):
-        """ViT encoder from Eqs. (2)-(3)"""
-        # x: (B, N+1, D)
+        class_token = self.vit.cls_token.expand(x.shape[0], -1, -1)
+        x = torch.cat([class_token, x], dim=1)
         for blk in self.vit.blocks:
             x = blk(x)
         x = self.vit.norm(x)
-
-
-
-
-
-
-        vit_output = self.forward_encoder(x)
-        with torch.cuda.amp.autocast(enabled=False):
-            qformer_output = self.q_former(vit_output.float())
-        aligned_features = self.llama_proj(qformer_output.to(self.dtype))
-        return aligned_features
-
-
-    def add_to_history(self, role, content):
-        self.conversation_history.append({"role": role, "content": content})
-
-    def get_full_context(self):
-        return "\n".join([f"{msg['role']}: {msg['content']}" for msg in self.conversation_history])
-
-    def build_prompt(self, image_embeds, user_question=None):
-        # Base prompt for initial diagnosis
-        if not user_question:
-            prompt = (
-                "### Instruction: <Img><ImageHere></Img> "
-                "Could you describe the skin disease in this image for me? "
-                "### Response:"
-            )
-        else:
-            # Follow-up prompt with conversation history
-            history = self.get_full_context()
-            prompt = (
-                f"### Instruction: <Img><ImageHere></Img> "
-                f"Based on our previous conversation:\n{history}\n"
-                f"User asks: {user_question}\n"
-                "### Response:"
-            )
+        vit_features = self.ln_vision(x)
+
+        # Q-Former forward pass
+        with torch.no_grad():
+            qformer_output = self.q_former(vit_features.float())
+        image_embeds = self.llama_proj(qformer_output.to(self.dtype))
 
-        return
+        return image_embeds
 
-    def generate(self, images, user_input=None,
+    def generate(self, images, user_input=None, max_new_tokens=300):
         print("Analysing the image to generate the diagnosis")
-
-
+
+        image_embeds = self.encode_image(images)
+        print(f"Aligned features : {image_embeds}")
         print("Generated the aligned features with ViT and Qformer")
+
         prompt = (
-            "<Img><ImageHere></Img>
+            "### Instruction: <Img><ImageHere></Img> "
+            "Could you describe the skin condition in this image? "
+            "Please provide a detailed analysis including possible diagnoses. "
+            "### Response:"
         )
         inputs = self.tokenizer(prompt, return_tensors="pt").to(images.device)
         image_token_id = self.tokenizer.convert_tokens_to_ids("<ImageHere>")
@@ -347,28 +273,39 @@ class SkinGPT4(nn.Module):
             raise ValueError("Image token not found in prompt")
         # Prepare embeddings
         input_embeddings = self.llama.model.embed_tokens(inputs.input_ids)
-
-        visual_embeds = aligned_features.mean(dim=1, keepdim=True)  # [1, 1, 5120]
-        visual_embeds = visual_embeds.to(input_embeddings.dtype)
-        print(f"Visual embeddings : {visual_embeds}")
+        visual_embeds = image_embeds.mean(dim=1, keepdim=True)
         input_embeddings[image_token_pos] = visual_embeds
-
+
+        # outputs = self.llama.generate(
+        #     inputs_embeds=input_embeddings,
+        #     max_new_tokens=max_length,
+        #     temperature=0.7,
+        #     top_p=0.9,
+        #     repetition_penalty=1.2,  # Prevent repetition
+        #     do_sample=True,
+        #     pad_token_id=self.tokenizer.eos_token_id,
+        #     eos_token_id=self.tokenizer.eos_token_id
+        # )
 
         outputs = self.llama.generate(
             inputs_embeds=input_embeddings,
-            max_new_tokens=
-
-            top_p=0.9,
-            repetition_penalty=1.2,  # Prevent repetition
+            max_new_tokens=max_new_tokens,
+            num_beams=1,
             do_sample=True,
-
-
+            min_length=1,
+            top_p=0.9,
+            repetition_penalty=1.1,
+            length_penalty=1,
+            temperature=1.0,
+            pad_token_id=self.tokenizer.eos_token_id
        )
 
 
         full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         print(f"Output from llama : {full_output}")
-
+        response = full_output.split("### Response:")[-1].strip()
+
+        return response
 
 
 class SkinGPTClassifier:
@@ -395,7 +332,6 @@ class SkinGPTClassifier:
         )
         model = SkinGPT4(vit_checkpoint_path=model_path).eval()
         model = model.to(self.device)
-        model.eval()
         return model
 
     def predict(self, image):
@@ -450,7 +386,7 @@ if uploaded_file:
             else:
                 st.session_state.conversation.append(("assistant", result))
                 with st.chat_message("assistant"):
-                    st.markdown(result)
+                    st.markdown(result["diagnosis"])
     else:
         # Follow-up questions
         if user_query := st.chat_input("Ask a follow-up question..."):