File size: 7,029 Bytes
2fc7f46
 
 
dac012f
2fc7f46
 
 
 
 
dac012f
 
 
 
 
 
2fc7f46
21ef45f
2fc7f46
 
 
 
 
 
 
 
 
 
 
 
 
dac012f
 
 
 
 
2fc7f46
dac012f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0f97b2
dac012f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0f97b2
 
2fc7f46
dac012f
 
 
 
2fc7f46
 
 
 
 
dac012f
2fc7f46
 
dac012f
 
2fc7f46
 
 
 
dac012f
 
2fc7f46
 
ac72f06
 
2fc7f46
 
 
 
 
a886703
ac72f06
 
2fc7f46
dac012f
 
 
2fc7f46
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import gradio as gr
import base64
import anthropic
from openai import OpenAI

# `Anthropic` is the client class from the `anthropic` package and is used below
# to call Claude models; the package must be installed in the environment.
from anthropic import Anthropic

def create_image_content(image, MT, detail = "low"):
    """Build an ``image_url`` content part that embeds an image as a data URL.

    Args:
        image: Base64-encoded image data, inserted after ``;base64,``.
        MT: Media type inserted after ``data:``, e.g. ``"image/png"``.
        detail: Value stored under the ``"detail"`` key (default ``"low"``).

    Returns:
        A dict with ``"type": "image_url"`` and a nested ``"image_url"`` dict
        holding the data URL and the detail value.
    """
    data_url = f"data:{MT};base64,{image}"
    url_payload = {"url": data_url, "detail": detail}
    return {"type": "image_url", "image_url": url_payload}

def image_to_base64(image_path):
    """Read the file at ``image_path`` in binary mode and return it base64-encoded.

    Returns:
        The base64 encoding of the file's bytes, decoded to a UTF-8 string.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def get_media_type(image_name):
    """Return the image media type implied by a file name's extension.

    The comparison is case-insensitive. JPEG, PNG, GIF and WebP extensions are
    recognised.

    Args:
        image_name: File name or path of the image. May be empty or ``None``.

    Returns:
        The media type string (for example ``"image/jpeg"``), or ``None`` when
        the name is empty/``None`` or its extension is not recognised.
    """
    if not image_name:
        return None

    # (suffixes, media type) pairs; extend this table to support more formats.
    known_types = (
        ((".jpg", ".jpeg"), "image/jpeg"),
        ((".png",), "image/png"),
        ((".gif",), "image/gif"),
        ((".webp",), "image/webp"),
    )
    lowered = image_name.lower()
    for suffixes, media_type in known_types:
        # str.endswith accepts a tuple, so one call covers every alias.
        if lowered.endswith(suffixes):
            return media_type
    return None

def set_system_message(sysmsg):
    """Wrap ``sysmsg`` in a one-element list holding a system-role message dict."""
    system_entry = dict(role="system", content=sysmsg)
    return [system_entry]

# Maps the GPT-4 Vision model labels to the "detail" value sent with the image.
_GPT4_VISION_DETAIL = {
    "gpt-4-vision Low": "low",
    "gpt-4-vision High": "high",
}


def _describe_with_claude(image_path, api_key, model, prompt, media_type):
    """Send the image and prompt to a Claude model and return the reply text."""
    client = Anthropic(api_key=api_key)
    message = client.messages.create(
        model=model,
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_to_base64(image_path),
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt,
                    },
                ],
            }
        ],
    )
    return message.content[0].text


def _describe_with_gpt4_vision(image_path, api_key, detail, prompt, media_type):
    """Send the image and prompt to GPT-4 Vision and return the reply text."""
    client = OpenAI(api_key=api_key)
    encoded_image = image_to_base64(image_path)
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{media_type};base64,{encoded_image}",
            "detail": detail,
        },
    }
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=set_system_message("You are GPT-4.") + [
            {"role": "user", "content": [image_part]},
            {"role": "user", "content": prompt},
        ],
        max_tokens=1024,
    )
    return response.choices[0].message.content


def describe_image(image_path, claude_api_key, openai_api_key, model, prompt):
    """Send the image to the selected model for description.

    Args:
        image_path: Path of the image file to describe.
        claude_api_key: API key used when ``model`` starts with ``"claude"``.
        openai_api_key: API key used for the ``"gpt-4-vision ..."`` models.
        model: A model name starting with ``"claude"``, or one of the keys of
            ``_GPT4_VISION_DETAIL``.
        prompt: Text prompt sent along with the image.

    Returns:
        The model's reply text on success. Otherwise a human-readable message:
        a validation message when the model, API key or image is missing or
        unsupported, or ``"Error: ..."`` when the request itself raises.
        The function does not raise for failed requests because its result is
        shown directly to the user.
    """
    # Validate the cheap things first so the user gets a specific message
    # instead of an opaque exception text (or no output at all).
    if not model:
        return "Please select a model."

    is_claude = model.startswith("claude")
    if not is_claude and model not in _GPT4_VISION_DETAIL:
        return f"Unsupported model: {model}"

    if is_claude and not claude_api_key:
        return "Claude API key is required for Claude models."
    if not is_claude and not openai_api_key:
        return "OpenAI API key is required for GPT-4 Vision."

    if not image_path:
        return "Please upload an image."

    try:
        media_type = get_media_type(image_path)
        if is_claude:
            # The Claude request carries media_type as its own field, so an
            # unrecognised extension is reported here rather than sent as None.
            if media_type is None:
                return f"Unsupported image format: {image_path}"
            return _describe_with_claude(
                image_path, claude_api_key, model, prompt, media_type
            )
        return _describe_with_gpt4_vision(
            image_path,
            openai_api_key,
            _GPT4_VISION_DETAIL[model],
            prompt,
            media_type,
        )
    except Exception as e:
        # UI boundary: report any file or API failure as text.
        return f"Error: {str(e)}"

def main(image_path, claude_api_key, openai_api_key, model_a, model_b, prompt):
    """Describe one image with two models and return both descriptions.

    Returns:
        A ``(description_a, description_b)`` tuple from ``describe_image`` for
        ``model_a`` and ``model_b`` respectively. When neither API key is
        provided, both entries are the same "enter a valid API key" notice.
    """
    # Guard clause: without at least one key there is nothing to call.
    if not (claude_api_key or openai_api_key):
        missing_key_notice = "Please enter a valid API key."
        return missing_key_notice, missing_key_notice

    # Model A is queried before model B, as a tuple built in order.
    return tuple(
        describe_image(image_path, claude_api_key, openai_api_key, chosen_model, prompt)
        for chosen_model in (model_a, model_b)
    )

# Choices offered in both model dropdowns. Names starting with "claude" are
# routed to Claude; the "gpt-4-vision ..." labels select GPT-4 Vision with a
# low or high image detail setting (see describe_image).
model_options = [
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "gpt-4-vision Low",
    "gpt-4-vision High",
]

# Initial text of the editable "Custom Prompt" box.
DEFAULT_PROMPT = "As an AI image tagging expert, please provide precise tags for these images to enhance CLIP model's understanding of the content. Employ succinct keywords or phrases, steering clear of elaborate sentences and extraneous conjunctions. Prioritize the tags by relevance. Your tags should capture key elements such as the main subject, setting, artistic style, composition, image quality, color tone, filter, and camera specifications, and any other tags crucial for the image. When tagging photos of people, include specific details like gender, nationality, attire, actions, pose, expressions, accessories, makeup, composition type, age, etc. For other image categories, apply appropriate and common descriptive tags as well. Recognize and tag any celebrities, well-known landmark or IPs if clearly featured in the image. Your tags should be accurate, non-duplicative, and within a 20-75 word count range. These tags will use for image re-creation, so the closer the resemblance to the original image, the better the tag quality. Tags should be comma-separated. Exceptional tagging will be rewarded with $10 per image."

# Layout: image and API keys on the left, model pickers and the two outputs on
# the right, with the prompt box and Run button underneath.
with gr.Blocks() as iface:
    gr.Markdown("# Image Description with Claude Models and GPT-4 Vision")
    gr.Markdown("Drag and drop an image to get descriptions from different models.")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands the callback a path string, which
            # describe_image opens and reads itself.
            image_input = gr.Image(type="filepath", label="Upload Image")
            claude_api_key_input = gr.Textbox(type="password", label="Enter your Claude API Key")
            openai_api_key_input = gr.Textbox(type="password", label="Enter your OpenAI API Key")

        with gr.Column():
            model_a_dropdown = gr.Dropdown(choices=model_options, label="Model A")
            model_b_dropdown = gr.Dropdown(choices=model_options, label="Model B")

            with gr.Row():
                output_a = gr.Textbox(label="Description from Model A")
                output_b = gr.Textbox(label="Description from Model B")

    prompt_input = gr.Textbox(label="Custom Prompt", value=DEFAULT_PROMPT)
    run_button = gr.Button("Run")

    # main takes the inputs in exactly this order, so it is passed directly
    # rather than through a pass-through lambda.
    run_button.click(
        fn=main,
        inputs=[
            image_input,
            claude_api_key_input,
            openai_api_key_input,
            model_a_dropdown,
            model_b_dropdown,
            prompt_input,
        ],
        outputs=[output_a, output_b],
    )

if __name__ == "__main__":
    iface.launch()