Spaces: Runtime error
Commit 5e37be9
Parent(s): 7eb024a
Initial Commit
Files changed:
- app.py +234 -0
- extract_answers.py +45 -0
- image_text_proj.pth +3 -0
- linear_projection_final.pth +3 -0
- phi_model_trained/README.md +202 -0
- phi_model_trained/adapter_config.json +36 -0
- phi_model_trained/adapter_model.safetensors +3 -0
- process_cifar10.py +81 -0
- requirements.txt +10 -0
- train_linear_projection.py +216 -0
- train_phi_with_siglip.py +263 -0
app.py
ADDED
@@ -0,0 +1,234 @@
import gradio as gr
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import random
import numpy as np
from transformers import (
    SiglipVisionModel,
    AutoTokenizer,
    AutoImageProcessor,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from PIL import Image

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Projection modules, matching the definitions in train_linear_projection.py and
# train_phi_with_siglip.py. The training scripts save state_dicts, so the weights
# must be loaded back into callable modules rather than used directly.
class LinearProjection(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

class ImageTextProjection(nn.Module):
    def __init__(self, image_dim, text_dim):
        super().__init__()
        self.image_projection = nn.Linear(image_dim, text_dim)

    def forward(self, x):
        return self.image_projection(x)

# Load models and processors
def load_models():
    # Load SigLIP
    siglip_model = SiglipVisionModel.from_pretrained("google/siglip-so400m-patch14-384").to(device)
    siglip_processor = AutoImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")

    # Load Phi model with 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )
    phi_model = AutoModelForCausalLM.from_pretrained(
        "phi_model_trained",  # LoRA adapter directory saved by train_phi_with_siglip.py
        quantization_config=bnb_config,
        device_map="auto"
    )
    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    if phi_tokenizer.pad_token is None:
        phi_tokenizer.pad_token = phi_tokenizer.eos_token

    # Load trained projections. Dimensions are inferred from the saved weight shapes,
    # then the state_dicts are loaded into the freshly built modules.
    linear_state = torch.load('linear_projection_final.pth', map_location=device)
    linear_proj = LinearProjection(
        linear_state['linear.weight'].shape[1],
        linear_state['linear.weight'].shape[0]
    ).to(device)
    linear_proj.load_state_dict(linear_state)
    linear_proj.eval()

    proj_state = torch.load('image_text_proj.pth', map_location=device)
    image_text_proj = ImageTextProjection(
        proj_state['image_projection.weight'].shape[1],
        proj_state['image_projection.weight'].shape[0]
    ).to(device)
    image_text_proj.load_state_dict(proj_state)
    image_text_proj.eval()

    return (siglip_model, siglip_processor, phi_model, phi_tokenizer, linear_proj, image_text_proj)

# Load all models at startup
print("Loading models...")
models = load_models()
siglip_model, siglip_processor, phi_model, phi_tokenizer, linear_proj, image_text_proj = models
print("Models loaded successfully!")

# Load CIFAR10 test dataset
transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
])

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Get first 100 images
first_100_images = [(images, labels) for images, labels in list(testset)[:100]]

# Questions list
questions = [
    "Give a description of the image?",
    "How does the main object in the image look like?",
    "How can the main object in the image be useful to humans?",
    "What is the color of the main object in the image?",
    "Describe the setting of the image?"
]

def get_image_embedding(image, siglip_model, siglip_processor, linear_proj, device):
    with torch.no_grad():
        # Process image through SigLIP
        inputs = siglip_processor(image, return_tensors="pt")
        # Move inputs to device
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        outputs = siglip_model(**inputs)
        image_features = outputs.pooler_output

        # Project through trained linear layer
        projected_features = linear_proj(image_features)

    return projected_features

def get_random_images():
    # Select 10 random images from first 100
    selected_indices = random.sample(range(100), 10)
    selected_images = [first_100_images[i][0] for i in selected_indices]

    # Convert to numpy arrays and transpose to correct format (H,W,C)
    images_np = [img.permute(1, 2, 0).numpy() for img in selected_images]
    return images_np, selected_indices

def generate_answer(image_tensor, question_index):
    if image_tensor is None:
        return "Please select an image first!"

    try:
        # Gradio Number components pass floats, so coerce to an int index
        question_index = int(question_index)

        # Get image embedding
        image_embedding = get_image_embedding(
            image_tensor,
            siglip_model,
            siglip_processor,
            linear_proj,
            device
        )

        # Get question
        question = questions[question_index]

        # Tokenize question
        question_tokens = phi_tokenizer(
            question,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Get question embeddings
        question_embeds = phi_model.get_input_embeddings()(question_tokens['input_ids'])

        # Project and prepare image embeddings
        image_embeds = image_text_proj(image_embedding)
        image_embeds = image_embeds.unsqueeze(1)
        # Match the dtype of the (possibly half-precision) Phi embeddings before concatenation
        image_embeds = image_embeds.to(question_embeds.dtype)

        # Combine embeddings
        combined_embedding = torch.cat([
            image_embeds,
            question_embeds
        ], dim=1)

        # Create attention mask
        attention_mask = torch.ones(
            (1, combined_embedding.size(1)),
            dtype=torch.long,
            device=device
        )

        # Generate answer
        with torch.no_grad():
            outputs = phi_model.generate(
                inputs_embeds=combined_embedding,
                attention_mask=attention_mask,
                max_new_tokens=100,
                num_beams=4,
                temperature=0.7,
                do_sample=True,
                pad_token_id=phi_tokenizer.pad_token_id,
                eos_token_id=phi_tokenizer.eos_token_id
            )

        # Decode the generated answer
        answer = phi_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer

    except Exception as e:
        return f"Error generating answer: {str(e)}"

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# CIFAR10 Image Question Answering System")

    # State variables
    selected_image_tensor = gr.State(None)
    image_indices = gr.State([])

    with gr.Row():
        with gr.Column():
            # Button to get random images
            random_btn = gr.Button("Get Random Images")
            # Gallery to display images
            gallery = gr.Gallery(
                label="Click an image to select it",
                show_label=True,
                elem_id="gallery",
                columns=[5],
                rows=[2],
                height="auto",
                allow_preview=False
            )

        with gr.Column():
            # Display selected image
            selected_img = gr.Image(label="Selected Image", height=200)
            # Question buttons
            q_buttons = []
            for i, q in enumerate(questions):
                btn = gr.Button(f"Q{i+1}: {q}")
                q_buttons.append(btn)
            # Answer textbox
            answer_box = gr.Textbox(label="Answer", lines=3)

    # Handle random image button click
    def on_random_click():
        images, indices = get_random_images()
        return {
            gallery: images,
            image_indices: indices,
            selected_image_tensor: None,
            selected_img: None,
            answer_box: ""
        }

    random_btn.click(
        on_random_click,
        outputs=[gallery, image_indices, selected_image_tensor, selected_img, answer_box]
    )

    # Handle image selection
    def on_image_select(evt: gr.SelectData, images, indices):
        if images is None or evt.index >= len(images):
            return None, None, ""
        selected_idx = indices[evt.index]
        selected_tensor = first_100_images[selected_idx][0]
        return selected_tensor, images[evt.index], ""

    gallery.select(
        on_image_select,
        inputs=[gallery, image_indices],
        outputs=[selected_image_tensor, selected_img, answer_box]
    )

    # Handle question button clicks
    for i, btn in enumerate(q_buttons):
        btn.click(
            generate_answer,
            inputs=[selected_image_tensor, gr.Number(value=i, visible=False)],
            outputs=answer_box
        )

demo.launch()
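The core trick in generate_answer above is to treat the projected SigLIP vector as a single extra "image token" embedding prepended to the question's token embeddings. A self-contained shape check of that fusion (a sketch with random tensors; 3072 is Phi-3-mini's hidden size, the sequence length is arbitrary):

import torch

batch, seq_len, embed_dim = 1, 12, 3072
image_embeds = torch.randn(batch, embed_dim).unsqueeze(1)     # [1, 1, 3072] -> one "image token"
question_embeds = torch.randn(batch, seq_len, embed_dim)      # token embeddings of the question
combined = torch.cat([image_embeds, question_embeds], dim=1)  # [1, 1 + seq_len, 3072]
attention_mask = torch.ones(batch, combined.size(1), dtype=torch.long)
print(combined.shape, attention_mask.shape)                   # [1, 13, 3072] and [1, 13]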
extract_answers.py
ADDED
@@ -0,0 +1,45 @@
import os
import re
import glob

def extract_assistant_answers(input_file):
    """Extract the text after 'Assistant:' from the input file."""
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split content by "Assistant:" to get all sections after it
    sections = content.split("Assistant:")

    # Process each section to get clean answers
    answers = []
    for section in sections[1:]:  # Skip the first split as it's before first "Assistant:"
        # Get text up to next "Q" or "User:" or end of string
        answer = section.split("Q")[0].split("User:")[0].strip()
        if answer:
            answers.append(answer)

    return answers

def process_all_files():
    """Process all image_*.txt files in the qa_outputs directory."""
    # Get all image_*.txt files
    input_files = glob.glob("qa_outputs/image_*.txt")

    for input_file in input_files:
        # Extract the base name without extension
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_extr.txt"

        # Extract answers
        answers = extract_assistant_answers(input_file)

        # Write answers to the output file
        with open(output_file, 'w', encoding='utf-8') as f:
            for i, answer in enumerate(answers, 1):
                f.write(f"{answer}\n")

        print(f"Processed {input_file} -> {output_file}")

if __name__ == "__main__":
    process_all_files()
    print("Extraction complete! Check the files with '_extr' suffix.")
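extract_answers.py relies on the decoded SmolVLM2 transcripts containing literal "User:" and "Assistant:" markers; note that any capital "Q" inside an answer truncates it, which is a limitation of this heuristic. A hypothetical round-trip (the file content below is an illustrative stand-in for a real qa_outputs file, and the snippet assumes it runs next to extract_answers.py):

import os
from extract_answers import extract_assistant_answers  # the __main__ guard keeps the import side-effect free

os.makedirs("qa_outputs", exist_ok=True)
sample = (
    "Q1: Give a description of the image?\n"
    "A1: User: Give a description of the image? Assistant: A small cat sitting on grass.\n"
    "Q2: What is the color of the main object in the image?\n"
    "A2: User: ... Assistant: It is brown and white.\n"
)
with open("qa_outputs/image_demo.txt", "w", encoding="utf-8") as f:
    f.write(sample)

print(extract_assistant_answers("qa_outputs/image_demo.txt"))
# -> ['A small cat sitting on grass.', 'It is brown and white.']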
image_text_proj.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ddb52b49da8704aff3b46f2c503ac20cf64e3e6efbe4844e9ac89e85d9673894
size 1586824
linear_projection_final.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:374fc454085ce3b227047bfbf45c2df30a97a308b593ad1f2ef5ec763cab5afb
size 592056
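Both .pth files in this commit are plain state_dicts saved with torch.save(module.state_dict(), ...) by the training scripts further down, so they have to be loaded back into a matching nn.Module before they can be called. A quick inspection sketch (assumes the two files are in the working directory):

import torch

for path in ("linear_projection_final.pth", "image_text_proj.pth"):
    state = torch.load(path, map_location="cpu")
    print(path, {name: tuple(w.shape) for name, w in state.items()})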
phi_model_trained/README.md
ADDED
@@ -0,0 +1,202 @@
---
base_model: microsoft/Phi-3-mini-4k-instruct
library_name: peft
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

### Framework versions

- PEFT 0.15.1
phi_model_trained/adapter_config.json
ADDED
@@ -0,0 +1,36 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 32,
  "lora_bias": false,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 16,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "mlp.dense_h_to_4h",
    "mlp.dense_4h_to_h",
    "self_attn.qkv_proj",
    "self_attn.dense"
  ],
  "task_type": "CAUSAL_LM",
  "trainable_token_indices": null,
  "use_dora": false,
  "use_rslora": false
}
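This adapter config records the LoRA settings used by train_phi_with_siglip.py (r=16, lora_alpha=32, dropout 0.05) on top of microsoft/Phi-3-mini-4k-instruct. app.py points AutoModelForCausalLM.from_pretrained at the phi_model_trained directory and relies on transformers' PEFT integration to resolve the base model; a minimal sketch of loading the adapter explicitly with PEFT instead (assumes the directory from this commit is on disk):

import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "phi_model_trained")  # applies the adapter described above
model.eval()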
phi_model_trained/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:91ad139ce2f99c0ed85ff06edd5ac6b766baef76fab1d3e896c9ac32589e96fb
size 25174552
process_cifar10.py
ADDED
@@ -0,0 +1,81 @@
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import os
from transformers import AutoProcessor, AutoModelForImageTextToText
from tqdm import tqdm

# Initialize model and processor
model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16
    #_attn_implementation="flash_attention_2"
).to("cuda" if torch.cuda.is_available() else "cpu")

# Create output directory
os.makedirs("SigLIP_Training/qa_outputs", exist_ok=True)

# Load CIFAR-10 dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.ToPILImage()
])

# Using test set instead of train set
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

# List of questions
questions = [
    "Give a description of the image?",
    "How does the main object in the image look like?",
    "How can the main object in the image be useful to humans?",
    "What is the color of the main object in the image?",
    "Describe the setting of the image?"
]

def process_image(image, image_idx):
    # Create output file
    output_file = f"SigLIP_Training/qa_outputs/image_{image_idx}.txt"

    with open(output_file, 'w') as f:
        for q_idx, question in enumerate(questions, 1):
            # Prepare the message for the model
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": question}
                    ]
                }
            ]

            # Process inputs
            inputs = processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt"
            ).to(model.device, dtype=torch.bfloat16)

            # Generate answer
            generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
            answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Write to file in the correct format
            f.write(f"Q{q_idx}: {question}\n")
            f.write(f"A{q_idx}: {answer}\n")

# Process all images from test set
print("Starting to process CIFAR-10 test set images...")
for idx, (image, _) in enumerate(tqdm(testset)):
    process_image(image, idx)
    #if idx >= 1000:  # Process first 1000 test images
    #    break

print("Processing complete! Check the SigLIP_Training/qa_outputs directory for results.")
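Each image_<idx>.txt written above holds five "Q{n}: ..." / "A{n}: ..." pairs, where the answer is the full decoded chat (prompt included), which is why extract_answers.py splits on "Assistant:". Note that this script writes to SigLIP_Training/qa_outputs/ while extract_answers.py and the training scripts read qa_outputs/, so the files presumably need to be moved or the paths aligned. A trivial peek at one output file (a sketch; assumes the script above has already run):

with open("SigLIP_Training/qa_outputs/image_0.txt", "r", encoding="utf-8") as f:
    print(f.read())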
requirements.txt
ADDED
@@ -0,0 +1,10 @@
torch>=2.0.0
transformers>=4.36.0
torchvision>=0.15.0
pillow>=9.3.0
tqdm>=4.65.0
numpy>=1.24.0
accelerate>=0.25.0
gradio>=4.19.0
bitsandbytes>=0.41.1
peft>=0.7.0
train_linear_projection.py
ADDED
@@ -0,0 +1,216 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from transformers import SiglipVisionModel, AutoTokenizer, AutoImageProcessor, AutoModel
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, Subset
import torchvision.transforms as transforms
from tqdm import tqdm
import os
import numpy as np
from PIL import Image
import argparse

def siglip_loss(image_embeddings, text_embeddings, temperature=0.07):
    # Normalize
    image_embeddings = F.normalize(image_embeddings, dim=-1)
    text_embeddings = F.normalize(text_embeddings, dim=-1)

    # Compute pairwise similarities
    logits = image_embeddings @ text_embeddings.T  # [batch_size, batch_size]
    logits = logits / temperature

    # Ground truth: 1.0 for matching pairs (diagonal), 0.0 for all others
    batch_size = logits.size(0)
    targets = torch.eye(batch_size).to(logits.device)

    # Apply binary cross-entropy with logits
    loss = F.binary_cross_entropy_with_logits(logits, targets)

    return loss

class LinearProjection(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

def get_text_embedding(text, tokenizer, device, max_length=128):
    # Ensure text is not empty and has minimum content
    if not text or len(text.strip()) == 0:
        text = "This is a placeholder description."

    # Tokenize with padding and truncation
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding='max_length',  # Changed to max_length padding
        truncation=True,
        max_length=max_length  # Fixed max length for all inputs
    )

    # Move inputs to device and ensure correct data type
    inputs = {
        k: v.to(device).float() for k, v in inputs.items()
    }

    # Return the input_ids as embeddings
    return inputs['input_ids'].float()  # Convert to float for the loss calculation

def main(num_images=100, batch_size=32, num_epochs=50, learning_rate=1e-4, load_checkpoint=True, checkpoint_path='linear_projection.pth'):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load models and processors
    siglip_model = SiglipVisionModel.from_pretrained("google/siglip-so400m-patch14-384")
    siglip_processor = AutoImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

    # Set padding token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Freeze SigLIP model
    for param in siglip_model.parameters():
        param.requires_grad = False

    siglip_model.to(device)

    # Get SigLIP output dimension and text embedding dimension
    # Create a proper dummy image (black image)
    dummy_image = Image.new('RGB', (384, 384), color='black')
    with torch.no_grad():
        siglip_inputs = siglip_processor(dummy_image, return_tensors="pt").to(device)
        siglip_outputs = siglip_model(**siglip_inputs)
        siglip_output_dim = siglip_outputs.pooler_output.shape[-1]

    # Get a sample text to determine embedding dimension
    dummy_text = "This is a test."
    dummy_embedding = get_text_embedding(dummy_text, tokenizer, device)
    text_embedding_dim = dummy_embedding.shape[-1]

    print(f"SigLIP output dimension: {siglip_output_dim}")
    print(f"Text embedding dimension: {text_embedding_dim}")

    # Create linear projection layer
    linear_proj = LinearProjection(siglip_output_dim, text_embedding_dim).to(device)

    # Load checkpoint if requested
    if load_checkpoint:
        try:
            checkpoint = torch.load(checkpoint_path, map_location=device)
            linear_proj.load_state_dict(checkpoint)
            print(f"Successfully loaded checkpoint from {checkpoint_path}")
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
            print("Starting training from scratch instead.")

    # Load CIFAR10 test dataset
    transform = transforms.Compose([
        transforms.Resize((384, 384)),
        transforms.ToTensor(),
    ])

    test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)
    subset_indices = list(range(num_images))
    subset_dataset = Subset(test_dataset, subset_indices)
    dataloader = DataLoader(subset_dataset, batch_size=batch_size, shuffle=False)

    # Create text files directory if it doesn't exist
    os.makedirs('qa_outputs', exist_ok=True)

    # Optimizer
    optimizer = AdamW(linear_proj.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        linear_proj.train()

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for batch_idx, (images, labels) in enumerate(progress_bar):
            images = images.to(device)
            batch_size = images.size(0)

            # Get image embeddings
            with torch.no_grad():
                siglip_inputs = siglip_processor(images, return_tensors="pt").to(device)
                siglip_outputs = siglip_model(**siglip_inputs)
                image_features = siglip_outputs.pooler_output

            # Project image features
            projected_image_features = linear_proj(image_features)

            # Process text for each line (1 to 5)
            total_batch_loss = 0
            for line_num in range(5):
                text_embeddings_list = []

                # Read text from files for current batch
                for idx in range(batch_size):
                    global_idx = batch_idx * batch_size + idx
                    if global_idx < num_images:
                        file_path = f'qa_outputs/image_{global_idx}_extr.txt'
                        try:
                            with open(file_path, 'r') as f:
                                lines = f.readlines()
                                text = lines[line_num].strip() if line_num < len(lines) else ""
                        except:
                            text = "No description available"

                        # Get text embeddings directly from tokenizer
                        text_embedding = get_text_embedding(text, tokenizer, device)
                        text_embeddings_list.append(text_embedding)

                if text_embeddings_list:
                    # Stack instead of cat since all embeddings have same size now
                    text_embeddings = torch.stack(text_embeddings_list, dim=0).squeeze(1)
                    loss = siglip_loss(projected_image_features, text_embeddings)
                    total_batch_loss += loss

            # Average loss over all text lines
            avg_batch_loss = total_batch_loss / 5

            # Backpropagation
            optimizer.zero_grad()
            avg_batch_loss.backward()
            optimizer.step()

            total_loss += avg_batch_loss.item()
            progress_bar.set_postfix({'loss': avg_batch_loss.item()})

        avg_epoch_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss:.4f}')

        # Save checkpoint after each epoch
        # checkpoint_dir = 'checkpoints'
        # os.makedirs(checkpoint_dir, exist_ok=True)
        # checkpoint_file = os.path.join(checkpoint_dir, f'linear_projection_epoch_{epoch+1}.pth')
        # torch.save(linear_proj.state_dict(), checkpoint_file)
        # print(f"Saved checkpoint to {checkpoint_file}")

    # Save final model
    torch.save(linear_proj.state_dict(), 'linear_projection_final.pth')
    print("Training completed. Final model saved as 'linear_projection_final.pth'")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train or continue training the linear projection layer')
    parser.add_argument('--num_images', type=int, default=100, help='Number of images to train on')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training')
    parser.add_argument('--num_epochs', type=int, default=50, help='Number of epochs to train')
    parser.add_argument('--learning_rate', type=float, default=1e-4, help='Learning rate')
    parser.add_argument('--load_checkpoint', action='store_true', help='Whether to load from checkpoint')
    parser.add_argument('--checkpoint_path', type=str, default='linear_projection.pth', help='Path to checkpoint file')

    args = parser.parse_args()
    main(
        num_images=args.num_images,
        batch_size=args.batch_size,
        num_epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        load_checkpoint=args.load_checkpoint,
        checkpoint_path=args.checkpoint_path
    )
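siglip_loss above is a sigmoid-style pairwise loss: every image-text pair in the batch is scored independently with binary cross-entropy, with the matching pairs on the diagonal as positives, rather than the softmax over the batch used in CLIP-style training; unlike the original SigLIP recipe, the temperature here is fixed and there is no learnable bias. A toy call (a sketch; random vectors stand in for the projected SigLIP features and the token-id "text embeddings" this script uses, and it assumes it runs next to train_linear_projection.py):

import torch
from train_linear_projection import siglip_loss  # the __main__ guard keeps the import side-effect free

img = torch.randn(4, 128)     # 4 projected image vectors (width = max_length = 128 in this script)
txt = torch.randn(4, 128)     # 4 matching text vectors
print(siglip_loss(img, txt))  # scalar tensor; entry (i, i) is the positive pair for image i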
train_phi_with_siglip.py
ADDED
@@ -0,0 +1,263 @@
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import (
    SiglipVisionModel,
    AutoTokenizer,
    AutoImageProcessor,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, Subset
import torchvision.transforms as transforms
from tqdm import tqdm
import os
from PIL import Image

class LinearProjection(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

class ImageTextProjection(nn.Module):
    def __init__(self, image_dim, text_dim):
        super().__init__()
        self.image_projection = nn.Linear(image_dim, text_dim)

    def forward(self, x):
        return self.image_projection(x)

def get_image_embedding(image, siglip_model, siglip_processor, linear_proj, device):
    with torch.no_grad():
        # Process image through SigLIP
        inputs = siglip_processor(image, return_tensors="pt")
        # Move inputs to the same device as model
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        outputs = siglip_model(**inputs)
        image_features = outputs.pooler_output

        # Project through trained linear layer
        projected_features = linear_proj(image_features)

    return projected_features

def main(
    num_images=100,
    batch_size=4,  # Smaller batch size due to memory constraints
    num_epochs=100,
    learning_rate=2e-4,
    questions=None  # List of 5 questions to be provided
):
    if questions is None or len(questions) != 5:
        print("Please provide exactly 5 questions!")
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load SigLIP model and processor
    siglip_model = SiglipVisionModel.from_pretrained("google/siglip-so400m-patch14-384").to(device)
    siglip_processor = AutoImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")

    # Load trained linear projection
    dummy_image = Image.new('RGB', (384, 384), color='black')
    with torch.no_grad():
        siglip_inputs = siglip_processor(dummy_image, return_tensors="pt")
        # Move inputs to device
        siglip_inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in siglip_inputs.items()}
        siglip_outputs = siglip_model(**siglip_inputs)
        siglip_output_dim = siglip_outputs.pooler_output.shape[-1]

    # First load the checkpoint to get the correct output dimension
    checkpoint = torch.load('linear_projection_final.pth', map_location=device)
    output_dim = checkpoint['linear.weight'].shape[0]  # Get the output dimension from saved weights
    print(f"Loading linear projection with output dimension: {output_dim}")

    # Initialize linear projection with correct dimensions
    linear_proj = LinearProjection(siglip_output_dim, output_dim).to(device)
    try:
        linear_proj.load_state_dict(checkpoint)
        print("Successfully loaded linear projection weights")
    except Exception as e:
        print(f"Error loading linear projection weights: {e}")
        return

    # Load Phi model with 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )

    phi_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3-mini-4k-instruct",
        quantization_config=bnb_config,
        device_map="auto"
    )
    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

    # Add padding token if not present
    if phi_tokenizer.pad_token is None:
        phi_tokenizer.pad_token = phi_tokenizer.eos_token

    # Get embedding dimension from phi model
    phi_embed_dim = phi_model.get_input_embeddings().weight.shape[1]

    # Create projection layer for image embeddings
    image_text_proj = ImageTextProjection(output_dim, phi_embed_dim).to(device)

    # Prepare model for k-bit training
    phi_model = prepare_model_for_kbit_training(phi_model)

    # Setup LoRA configuration
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["mlp.dense_h_to_4h", "mlp.dense_4h_to_h", "self_attn.qkv_proj", "self_attn.dense"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Get PEFT model
    phi_model = get_peft_model(phi_model, lora_config)

    # Freeze SigLIP and linear projection
    for param in siglip_model.parameters():
        param.requires_grad = False
    for param in linear_proj.parameters():
        param.requires_grad = False

    # Load CIFAR10 test dataset
    transform = transforms.Compose([
        transforms.Resize((384, 384)),
        transforms.ToTensor(),
    ])

    test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)
    subset_indices = list(range(num_images))
    subset_dataset = Subset(test_dataset, subset_indices)
    dataloader = DataLoader(subset_dataset, batch_size=batch_size, shuffle=False)

    # Optimizer for both phi model and image projection
    optimizer = AdamW([
        {'params': phi_model.parameters()},
        {'params': image_text_proj.parameters()}
    ], lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        phi_model.train()
        image_text_proj.train()

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for batch_idx, (images, _) in enumerate(progress_bar):
            images = images.to(device)
            batch_size = images.size(0)

            # Get image embeddings
            image_embeddings = get_image_embedding(images, siglip_model, siglip_processor, linear_proj, device)

            # Process each question
            for q_idx, question in enumerate(questions):
                # Read corresponding answers
                answers = []
                for idx in range(batch_size):
                    global_idx = batch_idx * batch_size + idx
                    if global_idx < num_images:
                        file_path = f'qa_outputs/image_{global_idx}_extr.txt'
                        try:
                            with open(file_path, 'r') as f:
                                lines = f.readlines()
                                answer = lines[q_idx].strip() if q_idx < len(lines) else ""
                                answers.append(answer)
                        except:
                            answers.append("No answer available")

                # Tokenize questions and answers for the entire batch
                question_tokens = phi_tokenizer(
                    [question] * batch_size,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt"
                ).to(device)

                target_tokens = phi_tokenizer(
                    answers,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt"
                ).to(device)

                # Get question embeddings for the entire batch
                question_embeds = phi_model.get_input_embeddings()(question_tokens['input_ids'])  # [batch_size, seq_len, embed_dim]

                # Project and prepare image embeddings for the entire batch
                image_embeds = image_text_proj(image_embeddings)  # [batch_size, embed_dim]
                image_embeds = image_embeds.unsqueeze(1)  # [batch_size, 1, embed_dim]

                # Combine image embeddings with question embeddings
                combined_embedding = torch.cat([
                    image_embeds,     # [batch_size, 1, embed_dim]
                    question_embeds   # [batch_size, seq_len, embed_dim]
                ], dim=1)  # [batch_size, 1+seq_len, embed_dim]

                # Create attention mask for the combined sequence
                attention_mask = torch.ones(
                    (batch_size, combined_embedding.size(1)),
                    dtype=torch.long,
                    device=device
                )

                # Prepare labels by shifting them right
                labels = target_tokens['input_ids'].clone()
                labels = torch.cat([
                    torch.full((batch_size, combined_embedding.size(1) - 1), -100, device=device),
                    labels
                ], dim=1)[:, :combined_embedding.size(1)]

                # Forward pass
                outputs = phi_model(
                    inputs_embeds=combined_embedding,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_loss += loss.item()

                # Backward pass
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                progress_bar.set_postfix({'loss': loss.item()})

        avg_epoch_loss = total_loss / (len(dataloader) * len(questions) * batch_size)
        print(f'Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss:.4f}')

    # Save the trained models
    phi_model.save_pretrained('phi_model_trained')
    torch.save(image_text_proj.state_dict(), 'image_text_proj.pth')
    print("Training completed. Models saved as 'phi_model_trained' and 'image_text_proj.pth'")

if __name__ == "__main__":
    # Example questions - replace with your actual questions
    questions = [
        "Give a description of the image?",
        "How does the main object in the image look like?",
        "How can the main object in the image be useful to humans?",
        "What is the color of the main object in the image?",
        "Describe the setting of the image?"
    ]

    main(questions=questions)
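Taken together, the likely intended order of the pipeline in this commit is: process_cifar10.py generates Q/A text for the CIFAR-10 test images with SmolVLM2; extract_answers.py strips the assistant answers into image_*_extr.txt files; train_linear_projection.py fits the SigLIP-to-text projection against those answers; train_phi_with_siglip.py trains the LoRA adapter and the image_text_proj layer on top of the frozen projection; and app.py finally serves the Gradio demo from the saved artifacts (phi_model_trained, linear_projection_final.pth, image_text_proj.pth).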