# Import necessary libraries
import gradio as gr  # Gradio: Library for building web interfaces
import requests       # Library for sending API requests
from openai import OpenAI  # OpenAI-compatible client for using Upstage Solar LLM
from io import BytesIO      # Tool for handling image data in memory
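# Dependencies (one possible install command, assuming a fresh environment):
#   pip install gradio requests openai pillow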

def extract_text_from_image(image, api_key):
    """
    Function to extract text from an image (using Upstage Document OCR API)
    """
    # Upstage API endpoint
    url = "https://api.upstage.ai/v1/document-digitization"

    # Set up headers for API key authentication
    headers = {"Authorization": f"Bearer {api_key}"}

    # Convert to RGB if needed (e.g. PNG uploads with an alpha channel
    # cannot be saved as JPEG), then write the image to an in-memory buffer
    if image.mode != "RGB":
        image = image.convert("RGB")
    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    buffer.seek(0)

    # Prepare files and data for the request
    files = {"document": ("image.jpg", buffer, "image/jpeg")}
    data = {"model": "ocr"}  # Model to use: OCR

    # Send POST request
    response = requests.post(url, headers=headers, files=files, data=data)

    # If request is successful, extract text
    if response.status_code == 200:
        text = response.json().get("text", "")  # Extract text from JSON response
        return text.strip()  # Remove leading/trailing whitespace and return
    else:
        # Return error message on failure
        return f"OCR Failed: {response.status_code} - {response.text}"



def translate_text_with_solar(korean_text, api_key):
    """
    Function to translate Korean text into English (using Upstage Solar Pro API)
    """
    # Initialize OpenAI client for calling Solar LLM
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.upstage.ai/v1"
    )

    # Construct prompt for the model
    prompt = (
        "Below is a handwritten letter in Korean.\n\n"
        f"{korean_text}\n\n"
        "Please translate it into English.\n\n"
        "Translated letter in English:"
    )

    # Call Solar LLM to perform translation
    response = client.chat.completions.create(
        model="solar-pro",  # Model to use
        messages=[{"role": "user", "content": prompt}],  # User message
        temperature=0.5,     # Sampling temperature (lower = more literal, higher = more creative)
        max_tokens=2048      # Max response length
    )

    # Return translated text
    return response.choices[0].message.content

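# Note: unlike extract_text_from_image (which returns an error string on failure),
# translate_text_with_solar raises if the API call fails. A minimal sketch of
# defensive usage, with illustrative names only:
#
#   try:
#       english = translate_text_with_solar(korean_text, api_key)
#   except Exception as exc:  # e.g. openai.APIError subclasses
#       english = f"Translation failed: {exc}"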

# Gradio interface layout
with gr.Blocks() as demo:
    # Header description
    gr.Markdown("# πŸ’Œ Handwritten Letter Translator")
    gr.Markdown("Upload a letter image to extract Korean text using Upstage Document OCR.\nClick the 🌐 Translate button to translate it into English using Solar LLM!")
    gr.Markdown("The example images are AI-generated. Click the Files button to view or download them.")

    # ✅ API Key input
    api_key_input = gr.Textbox(label="🔑 Upstage API Key", type="password", placeholder="Paste your API key here")

    # Layout: 2-column format
    with gr.Row():
        # Left column: image upload
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="💌 Upload Letter Image")

        # Right column: extracted text and translation
        with gr.Column(scale=2):
            korean_box = gr.Textbox(label="📝 Extracted Korean Text", lines=10)
            translate_button = gr.Button("🌐 Translate")
            english_box = gr.Textbox(label="Translated English Text", lines=10)

    # Step 1: Run OCR when image is uploaded β†’ display extracted text
    image_input.change(fn=extract_text_from_image, inputs=[image_input, api_key_input], outputs=korean_box)

    # Step 2: Run translation when button is clicked β†’ display translated result
    translate_button.click(fn=translate_text_with_solar, inputs=[korean_box, api_key_input], outputs=english_box)

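# A minimal sketch of standalone (non-UI) usage, kept as comments so the script's
# behaviour is unchanged; the file name "letter.jpg" and the UPSTAGE_API_KEY
# environment variable are illustrative assumptions, not part of this app:
#
#   import os
#   from PIL import Image
#
#   key = os.environ["UPSTAGE_API_KEY"]
#   korean = extract_text_from_image(Image.open("letter.jpg"), key)
#   print(translate_text_with_solar(korean, key))
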
# Run app
if __name__ == "__main__":
    demo.launch()