File size: 1,946 Bytes
494a766
695027d
 
964c03d
5809b85
964c03d
695027d
494a766
 
 
 
695027d
964c03d
494a766
964c03d
494a766
 
 
 
964c03d
 
 
85a339a
964c03d
494a766
5809b85
 
 
 
 
 
 
 
494a766
5809b85
 
 
 
964c03d
494a766
5809b85
 
964c03d
695027d
 
494a766
 
 
 
 
5809b85
494a766
 
 
 
695027d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import gradio as gr
from transformers import pipeline
from huggingface_hub import InferenceClient
from PIL import Image
import io

# Get the Hugging Face API token from the environment; fail fast at import
# time so the app never starts half-configured.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set. Please set it with your Hugging Face API token.")

# Initialize both backends up front: a local transformers pipeline (downloads
# the model on first run) and a hosted Inference API client for the same model.
# `local_extractor` and `api_client` are used by extract_text() below.
local_extractor = pipeline("image-to-text", model="microsoft/git-base-textcaps")
api_client = InferenceClient(model="microsoft/git-base-textcaps", token=hf_token)

# Flag to track which mode is active (False = local pipeline, True = hosted API).
use_api = False

def switch_mode():
    """Toggle between the hosted Inference API and the local pipeline.

    Returns:
        A human-readable label for the newly active mode. This string is
        displayed in the mode-indicator textbox and later parsed by
        extract_text() (which checks for the substring "API").
    """
    global use_api
    use_api = not use_api
    if use_api:
        return "Using API"
    # Fixed: the original message had an unbalanced parenthesis
    # ("...run on GPU" with no closing ")").
    return "Using Local Model (Slow unless duplicated and run on GPU)"

def extract_text(image, mode_indicator):
    """Generate caption text for an image via the local model or the API.

    Args:
        image: PIL.Image.Image from the Gradio image input, or None when the
            input has been cleared.
        mode_indicator: Current-mode label string; the request is routed to
            the hosted API when it contains the substring "API", otherwise
            to the local pipeline.

    Returns:
        The generated caption text, or "" when no image is provided.
    """
    # Gradio's .change event fires with None when the image is cleared;
    # the original code crashed here on `None.format`.
    if image is None:
        return ""

    # Re-encode to PNG if needed so both backends receive a consistent format.
    # Note: `image.format` is None for in-memory images, which correctly
    # falls into the conversion branch.
    if image.format != 'PNG':
        png_buffer = io.BytesIO()
        image.save(png_buffer, format='PNG')
        # Fixed: rewind before reopening — after save() the stream position
        # is at EOF, so Image.open() would fail with UnidentifiedImageError.
        png_buffer.seek(0)
        png_image = Image.open(png_buffer)
    else:
        png_image = image

    if "API" in mode_indicator:
        # The Inference API expects raw image bytes, not a PIL object.
        buffered = io.BytesIO()
        png_image.save(buffered, format="PNG")
        img_bytes = buffered.getvalue()
        result = api_client.image_to_text(image=img_bytes)
    else:
        result = local_extractor(png_image)

    # The local pipeline returns [{'generated_text': ...}]; the API client
    # may return the text directly.
    return result[0]['generated_text'] if isinstance(result, list) else result

# Create the Gradio interface. Components must be instantiated inside the
# Blocks context; event handlers are wired up after creation.
with gr.Blocks() as iface:
    gr.Markdown("# Image Text Extractor")
    with gr.Row():
        image_input = gr.Image(type="pil")
        text_output = gr.Textbox()
    mode_button = gr.Button("Switch Mode")
    # The indicator's text doubles as app state: extract_text() checks it
    # for the substring "API" to decide which backend to use.
    mode_indicator = gr.Textbox(value="Using Local Model", label="Current Mode")
    
    # Button toggles the mode label; any change to the image re-runs extraction.
    # NOTE(review): extract_text receives the indicator's *text*, so routing
    # depends on the exact strings returned by switch_mode().
    mode_button.click(switch_mode, outputs=mode_indicator)
    image_input.change(extract_text, inputs=[image_input, mode_indicator], outputs=text_output)

# Blocks until the server stops; runs at import time (no __main__ guard),
# which is the usual pattern for Hugging Face Spaces apps.
iface.launch()