# app.py: Qari Arabic OCR demo (Hugging Face Space, running on ZeroGPU)
import gradio as gr
import spaces
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np
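# Assumed dependency list, for reference only; the Space itself pins exact
# versions in its own requirements.txt:
#   pip install gradio spaces transformers qwen-vl-utils torch pillow numpy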
# Load model and processor
# model_name = "NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct"
model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda",
)
processor = AutoProcessor.from_pretrained(model_name)
max_tokens = 2000
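# @spaces.GPU (below) requests a ZeroGPU device for the duration of each call,
# so a GPU is only held while a request is actually being processed.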
@spaces.GPU
def perform_ocr(image):
    """Process an image and extract its text using the OCR model."""
    # Guard against an empty or cleared image input (the .change event also
    # fires when the upload is cleared).
    if image is None or not np.any(image):
        return "Error: no image to process."
    image = Image.fromarray(image)
    # Save to a uniquely named temporary file so the chat template can
    # reference the image by URI.
    src = str(uuid.uuid4()) + ".png"
    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
    image.save(src)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{src}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
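    # Note: to my understanding, qwen_vl_utils can typically also resolve plain
    # local paths, http(s) URLs, and PIL images in the "image" field; the
    # file:// URI above is one of the supported forms.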
    # Build model inputs from the chat template and the referenced image.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    # Generate, then strip the prompt tokens so only new text is decoded.
    generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    # Remove the temporary image file.
    os.remove(src)
    return output_text
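# Quick local sanity check (sketch; assumes an example image such as 2.jpg
# from the example gallery below is available next to this script):
#   print(perform_ocr(np.array(Image.open("2.jpg").convert("RGB"))))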
# Create Gradio interface
with gr.Blocks(title="Qari Arabic OCR") as demo:
    gr.Markdown("# Qari Arabic OCR")
    gr.Markdown("Upload an image to extract Arabic text in real time. This model is specialized for Arabic document OCR.")
    with gr.Row():
        with gr.Column(scale=1):
            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")
            # Example gallery
            gr.Examples(
                examples=[
                    ["2.jpg"],
                    ["3.jpg"],
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4,
            )
            # Submit button
            submit_btn = gr.Button("Extract Text")
        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
    # Model details
    with gr.Accordion("Model Information", open=False):
        gr.Markdown("""
        **Model:** Qari-OCR-0.2.2.1-VL-2B-Instruct
        **Description:** Arabic OCR model based on the Qwen2-VL architecture
        **Size:** 2B parameters
        **Output limit:** up to 2,000 generated tokens per request
        """)
    # Wire up the processing flow: run OCR on button click and whenever the image changes.
    submit_btn.click(fn=perform_ocr, inputs=image_input, outputs=output)
    image_input.change(fn=perform_ocr, inputs=image_input, outputs=output)

demo.launch()
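# Under concurrent load, requests can be serialized through Gradio's built-in
# queue before launching, e.g. demo.queue().launch() (a sketch; not part of
# the original Space configuration).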