import gradio as gr
import json
import urllib.request
import os
import ssl
import base64
import tempfile
import edge_tts
import re
import logging
from typing import Optional
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Azure ML endpoint configuration - these should be set as environment variables
url = os.getenv("AZURE_ENDPOINT")
api_key = os.getenv("AZURE_API_KEY")
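# Example configuration (hypothetical values): the scoring URL depends on your
# Azure ML deployment and typically ends in /score, e.g.
#   export AZURE_ENDPOINT="https://<endpoint-name>.<region>.inference.ml.azure.com/score"
#   export AZURE_API_KEY="<endpoint-key>"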
def call_aml_endpoint(payload, url, api_key):
"""Call Azure ML endpoint with the given payload."""
# Allow self-signed HTTPS certificates
def allow_self_signed_https(allowed):
if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
ssl._create_default_https_context = ssl._create_unverified_context
allow_self_signed_https(True)
# Set parameters (can be adjusted based on your needs)
parameters = {"temperature": 0.7}
if "parameters" not in payload["input_data"]:
payload["input_data"]["parameters"] = parameters
# Encode the request body
body = str.encode(json.dumps(payload))
if not api_key:
raise Exception("A key should be provided to invoke the endpoint")
# Set up headers
headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}
# Create and send the request
req = urllib.request.Request(url, body, headers)
try:
logger.info(f"Sending request to {url}")
response = urllib.request.urlopen(req)
result = response.read().decode('utf-8')
logger.info("Received response successfully")
return json.loads(result)
except urllib.error.HTTPError as error:
logger.error(f"Request failed with status code: {error.code}")
logger.error(f"Headers: {error.info()}")
error_message = error.read().decode("utf8", 'ignore')
logger.error(f"Error message: {error_message}")
return {"error": error_message}
def encode_base64_from_file(file_path):
"""Encode file content to base64 string and determine MIME type."""
file_extension = os.path.splitext(file_path)[1].lower()
# Map file extensions to MIME types
if file_extension in ['.jpg', '.jpeg']:
mime_type = "image/jpeg"
elif file_extension == '.png':
mime_type = "image/png"
elif file_extension == '.gif':
mime_type = "image/gif"
elif file_extension in ['.bmp', '.tiff', '.webp']:
mime_type = f"image/{file_extension[1:]}"
else:
mime_type = "image/jpeg" # Default to JPEG
# Read and encode file content
with open(file_path, "rb") as file:
encoded_string = base64.b64encode(file.read()).decode('utf-8')
return encoded_string, mime_type
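# For example (hypothetical path), encode_base64_from_file("content/sample.jpg")
# returns (base64-encoded file contents, "image/jpeg").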
class ImageOCRApp:
def __init__(self):
"""Initialize the app with Azure ML endpoint configurations"""
# Check if Azure endpoint and key are set
if not url or not api_key:
logger.warning("Azure ML endpoint or API key not set. Set AZURE_ENDPOINT and AZURE_API_KEY environment variables.")
def recognize_text(self, image_path: str) -> str:
"""Recognize text from the image using Azure ML endpoint"""
try:
# Encode image to base64
base64_image, mime_type = encode_base64_from_file(image_path)
# Prepare prompt for OCR
ocr_prompt = "Please identify the handwritten text in the image."
# Create content array for the payload
content_items = [
{"type": "text", "text": ocr_prompt},
{"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
]
# Create conversation state
conversation_state = [
{
"role": "user",
"content": content_items
}
]
# Create the payload
payload = {
"input_data": {
"input_string": conversation_state
}
}
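            # The request body mirrors a chat-style message with mixed text and
            # image content, roughly:
            #   {"input_data": {"input_string": [{"role": "user", "content": [
            #       {"type": "text", "text": "..."},
            #       {"type": "image_url", "image_url": {"url": "data:<mime>;base64,..."}}]}]}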
# Call Azure ML endpoint
response = call_aml_endpoint(payload, url, api_key)
# Extract text response from the Azure ML endpoint response
if isinstance(response, dict):
if "result" in response:
result = response["result"]
elif "output" in response:
# Depending on your API's response format
if isinstance(response["output"], list) and len(response["output"]) > 0:
result = response["output"][0]
else:
result = str(response["output"])
elif "error" in response:
logger.error(f"Error from Azure ML endpoint: {response['error']}")
result = f"Error: {response['error']}"
else:
# Just return the whole response as string if we can't parse it
result = f"Received response: {json.dumps(response)}"
else:
result = str(response)
return result
except Exception as e:
logger.error(f"Error recognizing text: {str(e)}", exc_info=True)
return f"Error recognizing text: {str(e)}"
async def text_to_speech(self, text: str, voice: str = "en-US-EricNeural") -> Optional[str]:
"""Convert text to speech using Edge TTS"""
if not text.strip():
return None
try:
communicate = edge_tts.Communicate(text, voice)
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
tmp_path = tmp_file.name
await communicate.save(tmp_path)
return tmp_path
except Exception as e:
logger.error(f"TTS Error: {str(e)}")
return None
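    # Note: any voice name supported by edge_tts (e.g. "en-US-JennyNeural") can be
    # passed as the `voice` argument; "en-US-EricNeural" is the default used here.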
def create_interface(self):
"""Create the Gradio interface"""
custom_css = """
.container { max-width: 900px; margin: auto; }
.input-section {
background: #f8f9fa;
padding: 20px;
border-radius: 10px;
margin-bottom: 20px;
}
.output-section {
background: #ffffff;
padding: 20px;
border-radius: 10px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
"""
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as interface:
# Header
gr.Markdown("""
# ✨ Stories Come Alive
### Transform handwritten moments into spoken memories
Turn precious handwritten stories, notes, and drawings into living words.
Whether it's a child's imaginative tale, a heartfelt letter, or a creative
story - let's bring those special handwritten moments to life through sight
            and sound πŸŽ¨πŸ“πŸŽ§. Currently supports English only. Other demos include [Phi-4-Mini playground](https://huggingface.co/spaces/microsoft/phi-4-mini),
[Thoughts Organizer](https://huggingface.co/spaces/microsoft/ThoughtsOrganizer),
[Phine Speech Translator](https://huggingface.co/spaces/microsoft/PhineSpeechTranslator).
""")
with gr.Row():
# Input section
with gr.Column(scale=1):
image_input = gr.Image(
label="Upload or Capture Image",
sources=["upload", "webcam"],
type="filepath"
)
# Example selector
gr.Markdown("### Try with Examples")
example_images = [
["content/kid.handwriting.draw.01.jpg", "Tiny Seed"],
["content/race.for.the.moon.jpg", "To the Moon!"],
["content/john.adam.move.to.dc.png", "Move to DC"],
]
gr.Examples(
examples=example_images,
inputs=image_input,
label="Example Images"
)
with gr.Row():
process_btn = gr.Button("πŸ” Recognize Text", variant="primary")
clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
status_msg = gr.Markdown("Ready to process image...")
# Output section
with gr.Column(scale=1):
recognized_text = gr.Textbox(
label="Recognized Text",
lines=5,
# readonly=True
)
tts_audio = gr.Audio(
label="Text-to-Speech Output",
visible=True,
interactive=False
)
# Event handlers
async def process_image(image):
if image is None:
return "Please upload or capture an image.", None, "⚠️ Please provide an image"
# Check if Azure ML endpoint and API key are set
if not url or not api_key:
return "Azure ML endpoint or API key not set. Please configure the environment variables.", None, "⚠️ Configuration error"
# Recognize text using Azure ML endpoint
text = self.recognize_text(image)
if not text or text.strip() == "":
return "No text was recognized in the image.", None, "⚠️ No text recognized"
# Clean up text - replace newlines with spaces and remove multiple spaces
cleaned_text = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()
# Generate audio immediately
audio_path = await self.text_to_speech(cleaned_text)
return text, audio_path, "βœ… Text recognized and audio generated"
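            # Reset the image, recognized text, audio, and status message to their
            # initial values (order matches the outputs wired to clear_btn below).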
def clear_inputs():
return None, "", None, "Ready to process image..."
process_btn.click(
fn=process_image,
inputs=[image_input],
outputs=[
recognized_text,
tts_audio,
status_msg
],
api_name="process_image"
)
clear_btn.click(
fn=clear_inputs,
inputs=[],
outputs=[
image_input,
recognized_text,
tts_audio,
status_msg
],
api_name="clear_inputs"
)
# Instructions
with gr.Accordion("ℹ️ How to Use", open=False):
gr.Markdown("""
1. **Upload or Capture**: Use your webcam or upload an image containing text
2. **Process**: Click 'Recognize Text' to extract text from the image
3. **Listen**: The audio will automatically play once text is recognized
Note: The system works best with clear, well-lit images of handwritten text.
### Configuration
Before using this app, set these environment variables:
- AZURE_ENDPOINT: Your Azure ML endpoint URL
- AZURE_API_KEY: Your Azure ML API key
""")
return interface
def run_app():
app = ImageOCRApp()
interface = app.create_interface()
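    # share=True asks Gradio to create a temporary public link, which mainly
    # matters for local runs; server_name="0.0.0.0" listens on all interfaces.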
interface.launch(
share=True,
server_name="0.0.0.0",
)
if __name__ == "__main__":
run_app()