import os

# Must be set BEFORE transformers (and, transitively, TensorFlow) is
# imported — oneDNN ops can introduce minor numerical differences.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import base64
import io

from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Load the captcha OCR model and its processor once at import time so
# repeated calls to resolve_captcha() reuse them.
processor = TrOCRProcessor.from_pretrained("anuashok/ocr-captcha-v3", use_fast=True)
model = VisionEncoderDecoderModel.from_pretrained(
    "anuashok/ocr-captcha-v3")


def resolve_captcha(image_path):
    """Run OCR on a captcha image and return the decoded text.

    Parameters
    ----------
    image_path : str
        Either a filesystem path to an image file, or a base64 data URI
        (a string starting with ``data:image``).

    Returns
    -------
    str
        The text the model reads from the captcha.

    Raises
    ------
    IndexError
        If a ``data:image`` string contains no comma separator.
    binascii.Error
        If the base64 payload of a data URI is malformed.
    """
    if isinstance(image_path, str) and image_path.startswith('data:image'):
        # Data URI: the payload is everything after the FIRST comma.
        # maxsplit=1 keeps the payload intact even if it contains commas.
        base64_data = image_path.split(',', 1)[1]
        image_bytes = base64.b64decode(base64_data)
        image = Image.open(io.BytesIO(image_bytes)).convert("RGBA")
    else:
        # Treat the argument as a regular file path.
        image = Image.open(image_path).convert("RGBA")

    # Flatten any transparency onto a white background; the model expects RGB.
    background = Image.new("RGBA", image.size, (255, 255, 255))
    combined = Image.alpha_composite(background, image).convert("RGB")

    # Normalize the image to model input tensors and decode greedily.
    pixel_values = processor(combined, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True)[0]
    return generated_text