import gradio as gr
import torch
from PIL import Image
import pandas as pd
from lavis.models import load_model_and_preprocess
from transformers import AutoModelForCausalLM, AutoProcessor
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
import tempfile
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load model and preprocessors for Image-Text Matching (LAVIS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_itm, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", device=device, is_eval=True)

# Load processor and model for Image Captioning (TextCaps)
git_processor_large_textcaps = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
git_model_large_textcaps = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps").to(device)  # move the model to the same device as its inputs

# Load Universal Sentence Encoder model for textual similarity calculation
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
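# USE v4 maps each sentence to a 512-dimensional embedding; the embeddings
# are compared with cosine similarity in compute_textual_similarity below.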

# Define a function to compute textual similarity between caption and statement
def compute_textual_similarity(caption, statement):
    # Convert caption and statement into sentence embeddings
    caption_embedding = embed([caption])[0].numpy()
    statement_embedding = embed([statement])[0].numpy()

    # Calculate cosine similarity between sentence embeddings
    similarity_score = cosine_similarity([caption_embedding], [statement_embedding])[0][0]
    return similarity_score
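# Example (hypothetical strings): semantically close sentences score near 1.0
# and unrelated ones near 0.0, e.g.
#   compute_textual_similarity("a dog running on grass", "a dog plays outside")
# should return a noticeably higher score than a pairing of unrelated sentences.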

# Read statements (one per line) from the external file 'statements.txt',
# skipping any blank lines
with open('statements.txt', 'r') as file:
    statements = [line.strip() for line in file if line.strip()]
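# Expected file format (hypothetical example), one statement per line:
#   a person riding a bicycle
#   a dog playing in a park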

# Function to compute ITM scores for the image-statement pair
def compute_itm_score(image, statement):
    logging.info('Starting compute_itm_score')
    # 'image' is a PIL image; preprocess it for the BLIP-2 ITM model
    img = vis_processors["eval"](image.convert("RGB")).unsqueeze(0).to(device)
    # Apply the LAVIS text processor to the statement before matching
    txt = text_processors["eval"](statement)
    itm_output = model_itm({"image": img, "text_input": txt}, match_head="itm")
    # The ITM head returns logits over (no-match, match); column 1 is the match probability
    itm_scores = torch.nn.functional.softmax(itm_output, dim=1)
    score = itm_scores[:, 1].item()
    logging.info('Finished compute_itm_score')
    return score
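# Example (hypothetical): for a photo of a cat on a sofa, the statement
# "a cat sitting on a sofa" should yield a match probability close to 1.0,
# while an unrelated statement should score close to 0.0.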

def generate_caption(processor, model, image):
    logging.info('Starting generate_caption')
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    logging.info('Finished generate_caption')
    return generated_caption
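# Example (hypothetical file name):
#   generate_caption(git_processor_large_textcaps, git_model_large_textcaps,
#                    Image.open("photo.jpg").convert("RGB"))
# returns a short natural-language caption for the image.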

def save_dataframe_to_csv(df):
    # Write the results directly to a temporary CSV file and return its path
    # so Gradio can offer it as a download
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv") as temp_file:
        df.to_csv(temp_file, index=False)
        temp_file_path = temp_file.name  # Get the file path
    return temp_file_path

# Main function to perform image captioning and image-text matching for multiple images
def process_images_and_statements(files):
    all_results_list = []

    # If 'files' is a list (as delivered by the File input), convert it to a
    # dictionary keyed by the temp-file path that Gradio exposes via .name
    if isinstance(files, list):
        files = {f.name: f for f in files}

    for file_name in files:
        # Load the uploaded image from its temp-file path with PIL
        image = Image.open(file_name).convert("RGB")
        caption = generate_caption(git_processor_large_textcaps, git_model_large_textcaps, image)
        for statement in statements:
            textual_similarity_score = compute_textual_similarity(caption, statement) * 100
            itm_score_statement = compute_itm_score(image, statement) * 100
            # Weight textual similarity and ITM equally in the combined score
            final_score = 0.5 * textual_similarity_score + 0.5 * itm_score_statement
            all_results_list.append({
                'Image File Name': file_name,  # Include the image file name
                'Statement': statement,
                'Generated Caption': caption,
                'Textual Similarity Score': f"{textual_similarity_score:.2f}%",
                'ITM Score': f"{itm_score_statement:.2f}%",
                'Final Combined Score': f"{final_score:.2f}%"
            })

    results_df = pd.DataFrame(all_results_list)
    csv_results = save_dataframe_to_csv(results_df)
    return results_df, csv_results
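# Note: the 50/50 weighting of caption similarity and ITM score is a simple
# heuristic; the weights could be tuned if one signal proves more reliable.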

# Gradio interface with File input to receive multiple images and file names
image_input = gr.inputs.File(file_count="multiple", type="file", label="Upload Images")
output_df = gr.outputs.Dataframe(type="pandas", label="Results")
output_csv = gr.outputs.File(label="Download CSV")

iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=image_input,
    outputs=[output_df, output_csv],
    title="Image Captioning and Image-Text Matching",
    theme='sudeepshouche/minimalist',
    css=".output { flex-direction: column; } .output .outputs { width: 100%; }" # Custom CSS
)

iface.launch(debug=True)