Nayera-2025 committed on
Commit 03760ea · verified · 1 Parent(s): 1018175

Create app.py

Files changed (1): app.py (+269, −0)
app.py ADDED
@@ -0,0 +1,269 @@
# police_vision_translator.py
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoProcessor,
    AutoModelForVision2Seq,
    VisionEncoderDecoderModel,
    SpeechT5ForTextToSpeech,
)

# Initialize models
print("Loading models...")

# 1. Vision model for document analysis. BLIP ships with its own processor;
#    pairing it with a ViT processor from a different checkpoint would produce
#    inputs the model cannot decode.
document_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
document_model = AutoModelForVision2Seq.from_pretrained("Salesforce/blip-image-captioning-large")

# 2. OCR for text extraction. TrOCR is a vision-encoder/text-decoder model,
#    so it loads with VisionEncoderDecoderModel, not AutoModelForSeq2SeqLM.
ocr_processor = AutoProcessor.from_pretrained("microsoft/trocr-base-printed")
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")

# 3. Translation model
translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

# 4. Speech recognition
speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-medium")

# 5. Text-to-speech. SpeechT5 has a dedicated class; it is loaded here for the
#    synthesis sketch further down but is not yet wired into the UI.
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")

print("Models loaded!")

# Language codes mapping
LANGUAGE_CODES = {
    "English": "eng_Latn",
    "Arabic": "ara_Arab",
    "Hindi": "hin_Deva",
    "Urdu": "urd_Arab",
    "Chinese": "zho_Hans",
    "Russian": "rus_Cyrl",
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Spanish": "spa_Latn",
    "Japanese": "jpn_Jpan"
}

def detect_document_type(image):
    """Detect what type of document is in the image."""
    # Use the vision model to generate a general caption for the image
    inputs = document_processor(images=image, return_tensors="pt")
    outputs = document_model.generate(**inputs, max_length=50)

    # Decode with the processor; the model object has no .tokenizer attribute
    description = document_processor.decode(outputs[0], skip_special_tokens=True)

    # Simple rule-based classification on the caption
    if "passport" in description.lower():
        return "Passport"
    elif "license" in description.lower() or "driving" in description.lower():
        return "Driver's License"
    elif "id" in description.lower() or "identity" in description.lower() or "card" in description.lower():
        return "ID Card"
    else:
        return "Unknown Document"

def extract_text_from_regions(image, regions):
    """Extract text from specific regions of the document."""
    results = {}
    img_array = np.array(image)

    for field_name, (x1, y1, x2, y2) in regions.items():
        # Extract region
        region = img_array[y1:y2, x1:x2]
        region_pil = Image.fromarray(region)

        # Process with OCR
        inputs = ocr_processor(images=region_pil, return_tensors="pt")
        generated_ids = ocr_model.generate(inputs["pixel_values"])
        text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        results[field_name] = text

    return results

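# The fixed fractional boxes in process_document() below stand in for real
# region detection. As a hedged sketch of an automatic alternative, the
# classical OpenCV heuristic here dilates dark text into blobs and returns
# their bounding boxes; a production system would use a learned layout model
# instead, and opencv-python is an assumed extra dependency.
import cv2

def detect_text_regions(image, min_area=500):
    """Return candidate text boxes as (x1, y1, x2, y2) tuples (sketch only)."""
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    # Otsu binarization, inverted so ink becomes white foreground
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # A wide dilation kernel merges characters on one line into a single blob
    dilated = cv2.dilate(binary, np.ones((5, 15), np.uint8), iterations=2)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w * h >= min_area:  # drop speckle
            boxes.append((x, y, x + w, y + h))
    return boxes
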
def translate_text(text, source_lang, target_lang):
    """Translate text between languages."""
    if not text or text.strip() == "":
        return ""

    # Get language codes
    src_code = LANGUAGE_CODES.get(source_lang, "eng_Latn")
    tgt_code = LANGUAGE_CODES.get(target_lang, "ara_Arab")

    # Tell the tokenizer the source language before tokenizing; the original
    # computed src_code but never used it, so every input was treated as English
    translator_tokenizer.src_lang = src_code
    inputs = translator_tokenizer(text, return_tensors="pt", padding=True)

    # Translate, forcing the decoder to start with the target-language token
    # (convert_tokens_to_ids works across transformers versions, unlike the
    # since-removed lang_code_to_id mapping)
    translated_tokens = translator_model.generate(
        **inputs,
        forced_bos_token_id=translator_tokenizer.convert_tokens_to_ids(tgt_code),
        max_length=128
    )

    # Decode
    translation = translator_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return translation

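# Usage sketch (assumes the models above are loaded): NLLB exposes each
# language code as an ordinary vocabulary token, so the id below is exactly
# what forced_bos_token_id receives inside translate_text().
#   translator_tokenizer.convert_tokens_to_ids("ara_Arab")  # Arabic token id
#   translate_text("Please show your identification.", "English", "Arabic")
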
def process_document(image, source_language="English", target_language="Arabic"):
    """Main function to process document images."""
    # Convert to PIL if it's not already
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # 1. Detect document type
    doc_type = detect_document_type(image)

    # 2. Define regions based on document type (simplified example).
    #    In a real implementation you would detect these regions with a layout
    #    model; see the detect_text_regions() sketch above.
    width, height = image.size

    if doc_type == "Passport":
        regions = {
            "Name": (int(width*0.3), int(height*0.2), int(width*0.9), int(height*0.3)),
            "Date of Birth": (int(width*0.3), int(height*0.35), int(width*0.7), int(height*0.45)),
            "Passport Number": (int(width*0.3), int(height*0.5), int(width*0.7), int(height*0.6))
        }
    elif doc_type == "ID Card":
        regions = {
            "Name": (int(width*0.3), int(height*0.15), int(width*0.9), int(height*0.25)),
            "ID Number": (int(width*0.3), int(height*0.3), int(width*0.7), int(height*0.4)),
            "Address": (int(width*0.1), int(height*0.5), int(width*0.9), int(height*0.7))
        }
    else:  # Driver's License or Unknown
        regions = {
            "Name": (int(width*0.3), int(height*0.2), int(width*0.9), int(height*0.3)),
            "License Number": (int(width*0.3), int(height*0.4), int(width*0.7), int(height*0.5)),
            "Expiration": (int(width*0.3), int(height*0.6), int(width*0.7), int(height*0.7))
        }

    # 3. Extract text from regions
    extracted_info = extract_text_from_regions(image, regions)

    # 4. Translate extracted text
    translated_info = {}
    for field, text in extracted_info.items():
        translated_info[field] = translate_text(text, source_language, target_language)

    # 5. Create annotated image
    annotated_img = image.copy()
    draw = ImageDraw.Draw(annotated_img)

    # Try a system TrueType font (needed for non-Latin scripts such as Arabic);
    # fall back to PIL's built-in bitmap font if it is unavailable
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    # Draw boxes and translations
    for field, (x1, y1, x2, y2) in regions.items():
        # Draw rectangle around region
        draw.rectangle([(x1, y1), (x2, y2)], outline="green", width=3)

        # Draw the field name above the box, original → translated text below it
        draw.text((x1, y1-25), field, fill="blue", font=font)
        draw.text((x1, y2+5), f"{extracted_info[field]} → {translated_info[field]}",
                  fill="red", font=font)

    # Return results
    return {
        "document_type": doc_type,
        "annotated_image": annotated_img,
        "extracted_text": extracted_info,
        "translated_text": translated_info
    }

def transcribe_speech(audio_file, source_language="English"):
    """Transcribe speech from an audio file."""
    # Whisper accepts lowercase full language names (e.g. "english", "arabic")
    result = speech_recognizer(audio_file, generate_kwargs={"language": source_language.lower()})
    return result["text"]

def translate_speech(audio_file, source_language="English", target_language="Arabic"):
    """Transcribe and translate speech."""
    # 1. Transcribe speech to text
    transcription = transcribe_speech(audio_file, source_language)

    # 2. Translate text
    translation = translate_text(transcription, source_language, target_language)

    return {
        "original_text": transcription,
        "translated_text": translation
    }

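# Minimal text-to-speech sketch using the SpeechT5 models loaded above, so the
# TTS checkpoint is not dead weight. Assumptions beyond the original commit:
# the "microsoft/speecht5_hifigan" vocoder and the "Matthijs/cmu-arctic-xvectors"
# speaker embeddings (the pairing used in the SpeechT5 documentation), plus the
# torch and datasets packages.
def synthesize_speech(text):
    """Return (sample_rate, waveform) for a short text string (sketch only)."""
    import torch
    from datasets import load_dataset
    from transformers import SpeechT5HifiGan

    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)

    inputs = tts_processor(text=text, return_tensors="pt")
    speech = tts_model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    # 16 kHz output, in the (rate, array) form a gr.Audio component accepts
    return 16000, speech.numpy()
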
# Gradio Interface
def create_ui():
    with gr.Blocks(title="Police Vision Translator") as app:
        gr.Markdown("# Dubai Police Vision Translator System")
        gr.Markdown("## Translate documents and speech in real time")

        with gr.Tab("Document Translation"):
            with gr.Row():
                with gr.Column():
                    doc_input = gr.Image(type="pil", label="Upload Document")
                    source_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
                                              value="English", label="Source Language")
                    target_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
                                              value="Arabic", label="Target Language")
                    process_btn = gr.Button("Process Document")

                with gr.Column():
                    doc_output = gr.Image(label="Annotated Document")
                    doc_type = gr.Textbox(label="Document Type")
                    extracted_json = gr.JSON(label="Extracted Information")
                    translated_json = gr.JSON(label="Translated Information")

            def run_document(img, src, tgt):
                # process_document returns a dict; unpack it in the same order
                # as the four output components (a bare dict would not map onto
                # multiple outputs)
                result = process_document(img, src, tgt)
                return (result["annotated_image"], result["document_type"],
                        result["extracted_text"], result["translated_text"])

            process_btn.click(
                fn=run_document,
                inputs=[doc_input, source_lang, target_lang],
                outputs=[doc_output, doc_type, extracted_json, translated_json]
            )

        with gr.Tab("Speech Translation"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(type="filepath", label="Record Speech")
                    speech_source_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
                                                     value="English", label="Source Language")
                    speech_target_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
                                                     value="Arabic", label="Target Language")
                    translate_btn = gr.Button("Translate Speech")

                with gr.Column():
                    original_text = gr.Textbox(label="Original Speech")
                    translated_text = gr.Textbox(label="Translated Text")

            def run_speech(audio, src, tgt):
                # translate_speech returns a dict; unpack it to match the two
                # output textboxes
                result = translate_speech(audio, src, tgt)
                return result["original_text"], result["translated_text"]

            translate_btn.click(
                fn=run_speech,
                inputs=[audio_input, speech_source_lang, speech_target_lang],
                outputs=[original_text, translated_text]
            )

        with gr.Tab("About"):
            gr.Markdown("""
            # Police Vision Translator MVP

            This system demonstrates AI-powered translation capabilities for law enforcement:

            - **Document Translation**: Identify and translate key fields in passports, IDs, and licenses
            - **Speech Translation**: Real-time translation of conversations with civilians

            ## Technologies Used

            - Vision Transformers for document analysis
            - NLLB-200 for translation between 200+ languages
            - Whisper for multilingual speech recognition
            - SpeechT5 for text-to-speech synthesis

            Developed for demonstration at the World AI Expo Dubai.
            """)

    return app

# Launch app
if __name__ == "__main__":
    demo = create_ui()
    demo.launch()
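
# Dependency note (assumed, not pinned by this commit): the imports above need
# gradio, transformers, torch, numpy, pillow, and opencv-python (for the
# region-detection sketch); sentencepiece is required by the NLLB tokenizer,
# and datasets by the speaker embeddings in synthesize_speech().
#   pip install gradio transformers torch numpy pillow opencv-python sentencepiece datasets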