AutoAssess / src /transcribe_image.py
TensorFlo's picture
bug fix
f77abbc
raw
history blame
2.5 kB
import base64
from openai import OpenAI
import os
from PIL import Image
import io
def resize_image(image, max_size=800):
    """Scale an image so its longest side is `max_size` and return it as PNG bytes.

    The aspect ratio is preserved. Images smaller than `max_size` are scaled
    up, larger ones are scaled down.

    Args:
        image: Anything `PIL.Image.open` accepts (a path or a binary
            file-like object).
        max_size: Target length, in pixels, of the longest side.

    Returns:
        An `io.BytesIO` holding the PNG-encoded result, rewound to position 0.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from PIL import ImageOps

    with Image.open(image) as source:
        # Bake the EXIF orientation into the pixels. The PNG written below
        # does not carry the orientation tag, so without this step photos
        # taken on a rotated phone would come out sideways.
        oriented = ImageOps.exif_transpose(source)
        width, height = oriented.size

        if width > height:
            new_width = max_size
            new_height = int((new_width / width) * height)
        else:
            new_height = max_size
            new_width = int((new_height / height) * width)

        # int() truncation can yield 0 for very elongated images, and a
        # zero-sized target makes resize() raise; keep at least one pixel.
        new_width = max(1, new_width)
        new_height = max(1, new_height)

        # LANCZOS gives high-quality rescaling.
        resized = oriented.resize(
            (new_width, new_height), Image.Resampling.LANCZOS
        )

    # Encode into memory so the caller can base64 the bytes directly.
    img_byte_arr = io.BytesIO()
    resized.save(img_byte_arr, format='PNG')
    img_byte_arr.seek(0)  # Rewind so the caller reads from the start
    return img_byte_arr
def encode_image(image_path):
    """Read the file at `image_path` and return its contents as base64 text.

    Args:
        image_path: Path of the file to encode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 `str`.

    Raises:
        AssertionError: If `image_path` does not exist.
    """
    path_exists = os.path.exists(image_path)
    assert path_exists, "The image file does not exist."

    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def encode_image_from_uploaded_file(image):
    """Resize an uploaded image and return the result as base64 text.

    Args:
        image: The uploaded image; passed straight to `resize_image`.

    Returns:
        The resized image's bytes, base64-encoded and decoded to a UTF-8 `str`.

    Raises:
        AssertionError: If `image` is None.
    """
    assert image is not None, "No image uploaded."

    # resize_image returns a rewound in-memory buffer, so read() yields
    # the complete encoded image.
    resized_buffer = resize_image(image)
    encoded = base64.b64encode(resized_buffer.read())
    return encoded.decode('utf-8')
def transcribe_image(image_file, *, model="gpt-4o-mini", max_tokens=300):
    """Transcribe handwritten text from an image via an OpenAI chat model.

    The image is resized and base64-encoded by
    `encode_image_from_uploaded_file`, then sent inline as a data URL
    together with a transcription prompt.

    Args:
        image_file: The uploaded image to transcribe.
        model: Name of the chat model to call.
        max_tokens: Upper bound on the length of the model's reply. Raise it
            for pages with a lot of text, otherwise the transcription may be
            cut off.

    Returns:
        The `content` of the first choice's message in the API response.
    """
    # Initialize the OpenAI client
    client = OpenAI()

    # Encoding the image
    base64_image = encode_image_from_uploaded_file(image_file)

    # resize_image re-encodes every upload as PNG, so the data URL must
    # declare image/png rather than image/jpeg.
    image_url = f"data:image/png;base64,{base64_image}"

    # Preparing the API call
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please transcribe the handwritten text in this image. Return only the text content."},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                    },
                ],
            }
        ],
        max_tokens=max_tokens,
    )

    transcribed_text = response.choices[0].message.content
    return transcribed_text