import gradio as gr
import json
import requests
import urllib.request
import os
import ssl
import base64
import tempfile
import edge_tts
import re
import logging
from PIL import Image
from io import BytesIO
from typing import Dict, List, Optional, Tuple, Union

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Azure ML endpoint configuration - these should be set as environment variables
url = os.getenv("AZURE_ENDPOINT")
api_key = os.getenv("AZURE_API_KEY")


def call_aml_endpoint(payload, url, api_key):
    """Call Azure ML endpoint with the given payload."""
    # Allow self-signed HTTPS certificates
    def allow_self_signed_https(allowed):
        if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
            ssl._create_default_https_context = ssl._create_unverified_context

    allow_self_signed_https(True)

    # Set parameters (can be adjusted based on your needs)
    parameters = {"temperature": 0.7}
    if "parameters" not in payload["input_data"]:
        payload["input_data"]["parameters"] = parameters

    # Encode the request body
    body = str.encode(json.dumps(payload))

    if not api_key:
        raise Exception("A key should be provided to invoke the endpoint")

    # Set up headers
    headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + api_key}

    # Create and send the request
    req = urllib.request.Request(url, body, headers)
    try:
        logger.info(f"Sending request to {url}")
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        logger.info("Received response successfully")
        return json.loads(result)
    except urllib.error.HTTPError as error:
        logger.error(f"Request failed with status code: {error.code}")
        logger.error(f"Headers: {error.info()}")
        error_message = error.read().decode("utf8", 'ignore')
        logger.error(f"Error message: {error_message}")
        return {"error": error_message}


def encode_base64_from_file(file_path):
    """Encode file content to base64 string and determine MIME type."""
    file_extension = os.path.splitext(file_path)[1].lower()

    # Map file extensions to MIME types
    if file_extension in ['.jpg', '.jpeg']:
        mime_type = "image/jpeg"
    elif file_extension == '.png':
        mime_type = "image/png"
    elif file_extension == '.gif':
        mime_type = "image/gif"
    elif file_extension in ['.bmp', '.tiff', '.webp']:
        mime_type = f"image/{file_extension[1:]}"
    else:
        mime_type = "image/jpeg"  # Default to JPEG

    # Read and encode file content
    with open(file_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode('utf-8')

    return encoded_string, mime_type


class ImageOCRApp:
    def __init__(self):
        """Initialize the app with Azure ML endpoint configurations"""
        # Check if Azure endpoint and key are set
        if not url or not api_key:
            logger.warning("Azure ML endpoint or API key not set. Set AZURE_ENDPOINT and AZURE_API_KEY environment variables.")

    def recognize_text(self, image_path: str) -> str:
        """Recognize text from the image using Azure ML endpoint"""
        try:
            # Encode image to base64
            base64_image, mime_type = encode_base64_from_file(image_path)

            # Prepare prompt for OCR
            ocr_prompt = "Please identify the handwritten text in the image."
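
            # Assumed request schema for the multimodal endpoint (inferred from this
            # script, not from official docs) -- roughly:
            #   {"input_data": {"input_string": [{"role": "user", "content": [
            #       {"type": "text", "text": ...},
            #       {"type": "image_url", "image_url": {"url": "data:<mime>;base64,<...>"}}]}],
            #    "parameters": {"temperature": 0.7}}}
            # Verify the exact schema against your own Azure ML deployment.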
            # Create content array for the payload
            content_items = [
                {"type": "text", "text": ocr_prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]

            # Create conversation state
            conversation_state = [
                {
                    "role": "user",
                    "content": content_items
                }
            ]

            # Create the payload
            payload = {
                "input_data": {
                    "input_string": conversation_state
                }
            }

            # Call Azure ML endpoint
            response = call_aml_endpoint(payload, url, api_key)

            # Extract text response from the Azure ML endpoint response
            if isinstance(response, dict):
                if "result" in response:
                    result = response["result"]
                elif "output" in response:
                    # Depending on your API's response format
                    if isinstance(response["output"], list) and len(response["output"]) > 0:
                        result = response["output"][0]
                    else:
                        result = str(response["output"])
                elif "error" in response:
                    logger.error(f"Error from Azure ML endpoint: {response['error']}")
                    result = f"Error: {response['error']}"
                else:
                    # Just return the whole response as a string if we can't parse it
                    result = f"Received response: {json.dumps(response)}"
            else:
                result = str(response)

            return result
        except Exception as e:
            logger.error(f"Error recognizing text: {str(e)}", exc_info=True)
            return f"Error recognizing text: {str(e)}"

    async def text_to_speech(self, text: str, voice: str = "en-US-EricNeural") -> Optional[str]:
        """Convert text to speech using Edge TTS"""
        if not text.strip():
            return None

        try:
            communicate = edge_tts.Communicate(text, voice)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            return tmp_path
        except Exception as e:
            logger.error(f"TTS Error: {str(e)}")
            return None

    def create_interface(self):
        """Create the Gradio interface"""
        custom_css = """
        .container { max-width: 900px; margin: auto; }
        .input-section { background: #f8f9fa; padding: 20px; border-radius: 10px; margin-bottom: 20px; }
        .output-section { background: #ffffff; padding: 20px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        """

        with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as interface:
            # Header
            gr.Markdown("""
            # ✨ Stories Come Alive
            ### Transform handwritten moments into spoken memories

            Turn precious handwritten stories, notes, and drawings into living words.
            Whether it's a child's imaginative tale, a heartfelt letter, or a creative story -
            let's bring those special handwritten moments to life through sight and sound. 🎨📝🎧

            Currently supports English. Other demos include
            [Phi-4-Mini playground](https://huggingface.co/spaces/microsoft/phi-4-mini),
            [Thoughts Organizer](https://huggingface.co/spaces/microsoft/ThoughtsOrganizer),
            [Phine Speech Translator](https://huggingface.co/spaces/microsoft/PhineSpeechTranslator).
""") with gr.Row(): # Input section with gr.Column(scale=1): image_input = gr.Image( label="Upload or Capture Image", sources=["upload", "webcam"], type="filepath" ) # Example selector gr.Markdown("### Try with Examples") example_images = [ ["content/kid.handwriting.draw.01.jpg", "Tiny Seed"], ["content/race.for.the.moon.jpg", "To the Moon!"], ["content/john.adam.move.to.dc.png", "Move to DC"], ] gr.Examples( examples=example_images, inputs=image_input, label="Example Images" ) with gr.Row(): process_btn = gr.Button("🔍 Recognize Text", variant="primary") clear_btn = gr.Button("đŸ—‘ī¸ Clear", variant="secondary") status_msg = gr.Markdown("Ready to process image...") # Output section with gr.Column(scale=1): recognized_text = gr.Textbox( label="Recognized Text", lines=5, # readonly=True ) tts_audio = gr.Audio( label="Text-to-Speech Output", visible=True, interactive=False ) # Event handlers async def process_image(image): if image is None: return "Please upload or capture an image.", None, "âš ī¸ Please provide an image" # Check if Azure ML endpoint and API key are set if not url or not api_key: return "Azure ML endpoint or API key not set. Please configure the environment variables.", None, "âš ī¸ Configuration error" # Recognize text using Azure ML endpoint text = self.recognize_text(image) if not text or text.strip() == "": return "No text was recognized in the image.", None, "âš ī¸ No text recognized" # Clean up text - replace newlines with spaces and remove multiple spaces cleaned_text = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip() # Generate audio immediately audio_path = await self.text_to_speech(cleaned_text) return text, audio_path, "✅ Text recognized and audio generated" def clear_inputs(): return None, "", None, "Ready to process image..." process_btn.click( fn=process_image, inputs=[image_input], outputs=[ recognized_text, tts_audio, status_msg ], api_name="process_image" ) clear_btn.click( fn=clear_inputs, inputs=[], outputs=[ image_input, recognized_text, tts_audio, status_msg ], api_name="clear_inputs" ) # Instructions with gr.Accordion("â„šī¸ How to Use", open=False): gr.Markdown(""" 1. **Upload or Capture**: Use your webcam or upload an image containing text 2. **Process**: Click 'Recognize Text' to extract text from the image 3. **Listen**: The audio will automatically play once text is recognized Note: The system works best with clear, well-lit images of handwritten text. ### Configuration Before using this app, set these environment variables: - AZURE_ENDPOINT: Your Azure ML endpoint URL - AZURE_API_KEY: Your Azure ML API key """) return interface def run_app(): app = ImageOCRApp() interface = app.create_interface() interface.launch( share=True, server_name="0.0.0.0", ) if __name__ == "__main__": run_app()