import gradio as gr
import spaces
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import uuid
import os
import numpy as np

# Load model and processor
# model_name = "NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct"
model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"
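# Note: device_map="cuda" assumes a GPU is available (e.g. a GPU-backed Space);
# on a CPU-only machine, device_map="auto" would be needed instead.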
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda",
)
processor = AutoProcessor.from_pretrained(model_name)
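# Upper bound on tokens generated per request (passed to generate() as max_new_tokens)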
max_tokens = 2000


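# @spaces.GPU requests a ZeroGPU slot for the duration of the call on
# Hugging Face Spaces; it has no effect when running elsewhere.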
@spaces.GPU
def perform_ocr(image):
    """Process an uploaded image and extract its text with the OCR model."""
    if image is None or not np.any(image):
        return "Error: please provide a non-empty image."
    image = Image.fromarray(image)
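    # Save the image to a uniquely named temporary PNG; qwen_vl_utils resolves
    # the "file://" URI in the chat message from this path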
    src = str(uuid.uuid4()) + ".png"
    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
    image.save(src)

    try:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{src}"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Build the text prompt and load the referenced image
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Generate text; generate() returns prompt + completion tokens,
        # so the prompt portion is sliced off before decoding
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
    finally:
        # Remove the temporary image even if generation fails
        os.remove(src)

    return output_text

# Create Gradio interface
with gr.Blocks(title="Qari Arabic OCR") as demo:
    gr.Markdown("# Qari Arabic OCR")
    gr.Markdown("Upload an image to extract Arabic text in real-time. This model is specialized for Arabic document OCR.")
    
    with gr.Row():
        with gr.Column(scale=1):
            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")
            
            # Example gallery
            gr.Examples(
                examples=[
                    ["2.jpg"],
                    ["3.jpg"]
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4
            )
            
            # Submit button
            submit_btn = gr.Button("Extract Text")
        
        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)
            
            # Model details
            with gr.Accordion("Model Information", open=False):
                gr.Markdown("""
                **Model:** Qari-OCR-0.2.2.1-VL-2B-Instruct
                **Description:** Arabic OCR model based on the Qwen2-VL architecture
                **Size:** 2B parameters
                **Output limit:** up to 2000 generated tokens per request
                """)
    
    # Set up processing flow
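    # OCR runs on explicit button clicks and also automatically whenever the
    # image input changes (e.g. when an example image is selected)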
    submit_btn.click(fn=perform_ocr, inputs=image_input, outputs=output)
    image_input.change(fn=perform_ocr, inputs=image_input, outputs=output)

demo.launch()
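
# Note: on Hugging Face Spaces, launch() needs no arguments; for local testing,
# demo.launch(share=True) would expose a temporary public link.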