Spaces:

microsoft
/

StoriesComeAlive

Build error

File size: 12,396 Bytes

import gradio as gr
import json
import requests
import urllib.request
import os
import ssl
import base64
import tempfile
import edge_tts
import re
import logging
from PIL import Image
from io import BytesIO
from typing import Dict, List, Optional, Tuple, Union

# Configure process-wide logging once, at import time
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Azure ML endpoint settings come from the environment; either value is None
# when the corresponding variable is not set.
url = os.environ.get("AZURE_ENDPOINT")
api_key = os.environ.get("AZURE_API_KEY")


def call_aml_endpoint(payload, url, api_key):
    """Call the Azure ML endpoint with the given payload.

    Args:
        payload: Request body as a dict. It must contain an ``"input_data"``
            dict. When that dict has no ``"parameters"`` entry, a default of
            ``{"temperature": 0.7}`` is sent. The caller's dict is not
            modified.
        url: Endpoint URL to POST to.
        api_key: Bearer token sent in the ``Authorization`` header.

    Returns:
        The decoded JSON response on success, or ``{"error": <body>}`` when
        the endpoint answers with an HTTP error status.

    Raises:
        ValueError: If ``api_key`` is empty (``ValueError`` is an
            ``Exception``, so existing ``except Exception`` callers still
            catch it).
        KeyError: If ``payload`` has no ``"input_data"`` entry.
        urllib.error.URLError: For transport failures that are not HTTP
            error responses (these are not converted to an error dict).
    """
    # Fail fast, before touching the payload or any global state
    if not api_key:
        raise ValueError("A key should be provided to invoke the endpoint")

    # Allow self-signed HTTPS certificates.
    # NOTE(security): this replaces the default HTTPS context for the whole
    # process, so certificate verification is disabled for every later
    # urllib HTTPS request, not only this one. Kept for compatibility with
    # endpoints that use self-signed certificates; set PYTHONHTTPSVERIFY to
    # opt out.
    if not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

    # Apply default parameters on a copy so the caller's payload stays untouched
    input_data = dict(payload["input_data"])
    input_data.setdefault("parameters", {"temperature": 0.7})
    request_payload = dict(payload)
    request_payload["input_data"] = input_data

    # Encode the request body
    body = json.dumps(request_payload).encode()

    # Set up headers
    headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}

    # Create and send the request
    req = urllib.request.Request(url, body, headers)

    try:
        logger.info("Sending request to %s", url)
        # The context manager closes the connection even if read/decode fails
        with urllib.request.urlopen(req) as response:
            result = response.read().decode('utf-8')
        logger.info("Received response successfully")
        return json.loads(result)
    except urllib.error.HTTPError as error:
        logger.error("Request failed with status code: %s", error.code)
        logger.error("Headers: %s", error.info())
        error_message = error.read().decode("utf8", 'ignore')
        logger.error("Error message: %s", error_message)
        return {"error": error_message}

def encode_base64_from_file(file_path):
    """Return the file's bytes as base64 text together with an image MIME type.

    The MIME type is chosen from the (case-insensitive) file extension only;
    the file content is not inspected. Unrecognized extensions are reported
    as ``image/jpeg``.

    Returns:
        A ``(base64_string, mime_type)`` tuple.
    """
    # Extension -> MIME type lookup
    mime_by_suffix = {
        '.jpg': "image/jpeg",
        '.jpeg': "image/jpeg",
        '.png': "image/png",
        '.gif': "image/gif",
        '.bmp': "image/bmp",
        '.tiff': "image/tiff",
        '.webp': "image/webp",
    }
    _, suffix = os.path.splitext(file_path)
    mime_type = mime_by_suffix.get(suffix.lower(), "image/jpeg")

    # Load the raw bytes, then encode them
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_string = base64.b64encode(raw_bytes).decode('utf-8')

    return encoded_string, mime_type

class ImageOCRApp:
    """Gradio app that recognizes handwritten text in an image and reads it aloud.

    Text recognition is delegated to the Azure ML endpoint configured through
    the module-level ``url`` and ``api_key``; speech is produced with Edge TTS.
    """

    def __init__(self):
        """Initialize the app and warn when the Azure ML configuration is missing."""
        # Check if Azure endpoint and key are set
        if not url or not api_key:
            logger.warning("Azure ML endpoint or API key not set. Set AZURE_ENDPOINT and AZURE_API_KEY environment variables.")

    @staticmethod
    def _as_text(value) -> str:
        """Coerce an endpoint value to ``str`` (``None`` becomes an empty string)."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _extract_response_text(response) -> Tuple[bool, str]:
        """Pull the text out of an endpoint response.

        Returns:
            ``(ok, text)``. ``ok`` is ``False`` only when the response carries
            an ``"error"`` entry (and no ``"result"``/``"output"``); ``text``
            is always a ``str``.
        """
        if not isinstance(response, dict):
            return True, str(response)

        if "result" in response:
            return True, ImageOCRApp._as_text(response["result"])

        if "output" in response:
            # Depending on the API's response format, output may be a list
            output = response["output"]
            if isinstance(output, list) and len(output) > 0:
                return True, ImageOCRApp._as_text(output[0])
            return True, str(output)

        if "error" in response:
            logger.error(f"Error from Azure ML endpoint: {response['error']}")
            return False, f"Error: {response['error']}"

        # Just return the whole response as string if we can't parse it
        return True, f"Received response: {json.dumps(response)}"

    def _run_ocr(self, image_path: str) -> Tuple[bool, str]:
        """Send the image to the endpoint and return ``(ok, text_or_error_message)``."""
        try:
            # Encode image to base64
            base64_image, mime_type = encode_base64_from_file(image_path)

            # Prepare prompt for OCR
            ocr_prompt = "Please identify the handwritten text in the image."

            # Single user turn: the instruction plus the image as a data URL
            conversation_state = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": ocr_prompt},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                    ],
                }
            ]

            # Create the payload
            payload = {
                "input_data": {
                    "input_string": conversation_state
                }
            }

            # Call Azure ML endpoint and interpret its answer
            response = call_aml_endpoint(payload, url, api_key)
            return self._extract_response_text(response)

        except Exception as e:
            logger.error(f"Error recognizing text: {str(e)}", exc_info=True)
            return False, f"Error recognizing text: {str(e)}"

    def recognize_text(self, image_path: str) -> str:
        """Recognize text from the image using the Azure ML endpoint.

        Args:
            image_path: Path of the image file to send.

        Returns:
            The recognized text, or a human-readable error message when the
            request fails. The result is always a ``str``.
        """
        return self._run_ocr(image_path)[1]

    async def text_to_speech(self, text: str, voice: str = "en-US-EricNeural") -> Optional[str]:
        """Convert text to speech using Edge TTS.

        Args:
            text: Text to speak. ``None``, empty or whitespace-only text
                produces no audio.
            voice: Edge TTS voice name.

        Returns:
            Path of the generated ``.mp3`` file, or ``None`` when there is
            nothing to speak or synthesis fails.
        """
        if not text or not text.strip():
            return None

        tmp_path = None
        try:
            communicate = edge_tts.Communicate(text, voice)
            # Only reserve a unique file name here; the handle is closed before
            # the audio is written so the path is not held open while saving.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            return tmp_path
        except Exception as e:
            logger.error(f"TTS Error: {str(e)}")
            # Do not leave an empty/partial temp file behind on failure
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
            return None

    def create_interface(self):
        """Create the Gradio interface.

        Returns:
            The ``gr.Blocks`` object with the upload/recognize/clear controls
            and their event handlers wired up.
        """
        custom_css = """
            .container { max-width: 900px; margin: auto; }
            .input-section { 
                background: #f8f9fa;
                padding: 20px;
                border-radius: 10px;
                margin-bottom: 20px;
            }
            .output-section {
                background: #ffffff;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }
        """

        with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as interface:
            # Header
            gr.Markdown("""
                # ✨ Stories Come Alive
                ### Transform handwritten moments into spoken memories
                
                Turn precious handwritten stories, notes, and drawings into living words. 
                Whether it's a child's imaginative tale, a heartfelt letter, or a creative 
                story - let's bring those special handwritten moments to life through sight 
                and sound. 🎨📝🎧. Currently support English. Other demos include [Phi-4-Mini playground](https://huggingface.co/spaces/microsoft/phi-4-mini), 
                [Thoughts Organizer](https://huggingface.co/spaces/microsoft/ThoughtsOrganizer), 
                [Phine Speech Translator](https://huggingface.co/spaces/microsoft/PhineSpeechTranslator).
            """)

            with gr.Row():
                # Input section
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        label="Upload or Capture Image",
                        sources=["upload", "webcam"],
                        type="filepath"
                    )

                    # Example selector
                    gr.Markdown("### Try with Examples")
                    # NOTE(review): each row has two values but only one input
                    # component is passed below; the second value looks like a
                    # display title - confirm how gr.Examples treats it.
                    example_images = [
                        ["content/kid.handwriting.draw.01.jpg", "Tiny Seed"],
                        ["content/race.for.the.moon.jpg", "To the Moon!"],
                        ["content/john.adam.move.to.dc.png", "Move to DC"],
                    ]
                    gr.Examples(
                        examples=example_images,
                        inputs=image_input,
                        label="Example Images"
                    )

                    with gr.Row():
                        process_btn = gr.Button("🔍 Recognize Text", variant="primary")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    status_msg = gr.Markdown("Ready to process image...")

                # Output section
                with gr.Column(scale=1):
                    recognized_text = gr.Textbox(
                        label="Recognized Text",
                        lines=5,
                    )

                    tts_audio = gr.Audio(
                        label="Text-to-Speech Output",
                        visible=True,
                        interactive=False
                    )

            # Event handlers
            async def process_image(image):
                """Run OCR then TTS; return (textbox value, audio path, status message)."""
                import asyncio

                if image is None:
                    return "Please upload or capture an image.", None, "⚠️ Please provide an image"

                # Check if Azure ML endpoint and API key are set
                if not url or not api_key:
                    return "Azure ML endpoint or API key not set. Please configure the environment variables.", None, "⚠️ Configuration error"

                # The endpoint call is blocking network I/O; run it in a worker
                # thread so this coroutine does not stall the event loop.
                loop = asyncio.get_running_loop()
                ok, text = await loop.run_in_executor(None, self._run_ocr, image)

                if not ok:
                    # Show the error message, but never read it aloud
                    return text, None, "⚠️ Text recognition failed"

                if not text or text.strip() == "":
                    return "No text was recognized in the image.", None, "⚠️ No text recognized"

                # Clean up text - collapse newlines and repeated whitespace into single spaces
                cleaned_text = re.sub(r'\s+', ' ', text).strip()

                # Generate audio immediately
                audio_path = await self.text_to_speech(cleaned_text)
                if audio_path is None:
                    return text, None, "⚠️ Text recognized, but audio generation failed"

                return text, audio_path, "✅ Text recognized and audio generated"

            def clear_inputs():
                """Reset the image, text, audio and status components."""
                return None, "", None, "Ready to process image..."

            process_btn.click(
                fn=process_image,
                inputs=[image_input],
                outputs=[
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="process_image"
            )

            clear_btn.click(
                fn=clear_inputs,
                inputs=[],
                outputs=[
                    image_input,
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="clear_inputs"
            )

            # Instructions
            with gr.Accordion("ℹ️ How to Use", open=False):
                gr.Markdown("""
                    1. **Upload or Capture**: Use your webcam or upload an image containing text
                    2. **Process**: Click 'Recognize Text' to extract text from the image
                    3. **Listen**: The audio will automatically play once text is recognized
                    
                    Note: The system works best with clear, well-lit images of handwritten text.
                    
                    ### Configuration
                    Before using this app, set these environment variables:
                    - AZURE_ENDPOINT: Your Azure ML endpoint URL
                    - AZURE_API_KEY: Your Azure ML API key
                """)

        return interface

def run_app():
    """Build the Gradio interface and launch it.

    The server binds to 0.0.0.0 and is launched with ``share=True``.
    """
    demo = ImageOCRApp().create_interface()
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
    }
    demo.launch(**launch_options)


if __name__ == "__main__":
    run_app()