Spaces:

Nayera-2025
/

Police-Vision-Translator

Sleeping

App Files Files Community

Nayera-2025 committed 11 days ago

Commit

03760ea

verified ·

1 Parent(s): 1018175

Create app.py

Browse files

Files changed (1) hide show

app.py +269 -0

app.py ADDED Viewed

	@@ -0,0 +1,269 @@

+# police_vision_translator.py
+import os
+import re
+import tempfile
+
+import cv2
+import gradio as gr
+import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers import AutoModelForVision2Seq, ViTImageProcessor
+from transformers import SpeechT5ForTextToSpeech
+# Initialize models
+print("Loading models...")
+# 1. Vision Document Analysis model
+document_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+document_model = AutoModelForVision2Seq.from_pretrained("Salesforce/blip-image-captioning-large")
+# 2. OCR for text extraction
+ocr_processor = AutoProcessor.from_pretrained("microsoft/trocr-base-printed")
+ocr_model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/trocr-base-printed")
+# 3. Translation model
+translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+# 4. Speech recognition
+speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-medium")
+# 5. Text-to-speech
+tts_model = AutoModelForSpeechSeq2Seq.from_pretrained("microsoft/speecht5_tts")
+tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
+print("Models loaded!")
+# Language codes mapping
+LANGUAGE_CODES = {
+    "English": "eng_Latn",
+    "Arabic": "ara_Arab",
+    "Hindi": "hin_Deva",
+    "Urdu": "urd_Arab",
+    "Chinese": "zho_Hans",
+    "Russian": "rus_Cyrl",
+    "French": "fra_Latn",
+    "German": "deu_Latn",
+    "Spanish": "spa_Latn",
+    "Japanese": "jpn_Jpan"
+}
+def detect_document_type(image):
+    """Detect what type of document is in the image"""
+    # Use vision model to get general description
+    inputs = document_processor(images=image, return_tensors="pt")
+    outputs = document_model.generate(**inputs, max_length=50)
+    # Convert output IDs to text
+    description = document_model.tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Simple rule-based classification
+    if "passport" in description.lower():
+        return "Passport"
+    elif "license" in description.lower() or "driving" in description.lower():
+        return "Driver's License"
+    elif "id" in description.lower() or "identity" in description.lower() or "card" in description.lower():
+        return "ID Card"
+    else:
+        return "Unknown Document"
+def extract_text_from_regions(image, regions):
+    """Extract text from specific regions of the document"""
+    results = {}
+    img_array = np.array(image)
+    for field_name, (x1, y1, x2, y2) in regions.items():
+        # Extract region
+        region = img_array[y1:y2, x1:x2]
+        region_pil = Image.fromarray(region)
+        # Process with OCR
+        inputs = ocr_processor(images=region_pil, return_tensors="pt")
+        generated_ids = ocr_model.generate(inputs["pixel_values"])
+        text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        results[field_name] = text
+    return results
+def translate_text(text, source_lang, target_lang):
+    """Translate text between languages"""
+    if not text or text.strip() == "":
+        return ""
+    # Get language codes
+    src_code = LANGUAGE_CODES.get(source_lang, "eng_Latn")
+    tgt_code = LANGUAGE_CODES.get(target_lang, "ara_Arab")
+    # Tokenize
+    inputs = translator_tokenizer(text, return_tensors="pt", padding=True)
+    # Translate
+    translated_tokens = translator_model.generate(
+        **inputs,
+        forced_bos_token_id=translator_tokenizer.lang_code_to_id[tgt_code],
+        max_length=128
+    )
+    # Decode
+    translation = translator_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    return translation
+def process_document(image, source_language="English", target_language="Arabic"):
+    """Main function to process document images"""
+    # Convert to PIL if it's not already
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+    # 1. Detect document type
+    doc_type = detect_document_type(image)
+    # 2. Define regions based on document type (simplified example)
+    # In a real implementation, you would use ML to detect these regions
+    width, height = image.size
+    if doc_type == "Passport":
+        regions = {
+            "Name": (int(width*0.3), int(height*0.2), int(width*0.9), int(height*0.3)),
+            "Date of Birth": (int(width*0.3), int(height*0.35), int(width*0.7), int(height*0.45)),
+            "Passport Number": (int(width*0.3), int(height*0.5), int(width*0.7), int(height*0.6))
+        }
+    elif doc_type == "ID Card":
+        regions = {
+            "Name": (int(width*0.3), int(height*0.15), int(width*0.9), int(height*0.25)),
+            "ID Number": (int(width*0.3), int(height*0.3), int(width*0.7), int(height*0.4)),
+            "Address": (int(width*0.1), int(height*0.5), int(width*0.9), int(height*0.7))
+        }
+    else:  # Driver's License or Unknown
+        regions = {
+            "Name": (int(width*0.3), int(height*0.2), int(width*0.9), int(height*0.3)),
+            "License Number": (int(width*0.3), int(height*0.4), int(width*0.7), int(height*0.5)),
+            "Expiration": (int(width*0.3), int(height*0.6), int(width*0.7), int(height*0.7))
+        }
+    # 3. Extract text from regions
+    extracted_info = extract_text_from_regions(image, regions)
+    # 4. Translate extracted text
+    translated_info = {}
+    for field, text in extracted_info.items():
+        translated_info[field] = translate_text(text, source_language, target_language)
+    # 5. Create annotated image
+    annotated_img = image.copy()
+    draw = ImageDraw.Draw(annotated_img)
+    # Attempt to load a font that supports Arabic
+    try:
+        font = ImageFont.truetype("arial.ttf", 20)  # Fallback to system font
+    except IOError:
+        font = ImageFont.load_default()
+    # Draw boxes and translations
+    for field, (x1, y1, x2, y2) in regions.items():
+        # Draw rectangle around region
+        draw.rectangle([(x1, y1), (x2, y2)], outline="green", width=3)
+        # Draw field name and translated text
+        draw.text((x1, y1-25), field, fill="blue", font=font)
+        draw.text((x1, y2+5), f"{extracted_info[field]} → {translated_info[field]}",
+                 fill="red", font=font)
+    # Return results
+    return {
+        "document_type": doc_type,
+        "annotated_image": annotated_img,
+        "extracted_text": extracted_info,
+        "translated_text": translated_info
+    }
+def transcribe_speech(audio_file, source_language="English"):
+    """Transcribe speech from audio file"""
+    result = speech_recognizer(audio_file, generate_kwargs={"language": source_language.lower()})
+    return result["text"]
+def translate_speech(audio_file, source_language="English", target_language="Arabic"):
+    """Transcribe and translate speech"""
+    # 1. Transcribe speech to text
+    transcription = transcribe_speech(audio_file, source_language)
+    # 2. Translate text
+    translation = translate_text(transcription, source_language, target_language)
+    return {
+        "original_text": transcription,
+        "translated_text": translation
+    }
+# Gradio Interface
+def create_ui():
+    with gr.Blocks(title="Police Vision Translator") as app:
+        gr.Markdown("# Dubai Police Vision Translator System")
+        gr.Markdown("## Translate documents, environmental text, and speech in real-time")
+        with gr.Tab("Document Translation"):
+            with gr.Row():
+                with gr.Column():
+                    doc_input = gr.Image(type="pil", label="Upload Document")
+                    source_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
+                                             value="English", label="Source Language")
+                    target_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
+                                             value="Arabic", label="Target Language")
+                    process_btn = gr.Button("Process Document")
+                with gr.Column():
+                    doc_output = gr.Image(label="Annotated Document")
+                    doc_type = gr.Textbox(label="Document Type")
+                    extracted_json = gr.JSON(label="Extracted Information")
+                    translated_json = gr.JSON(label="Translated Information")
+            process_btn.click(
+                fn=lambda img, src, tgt: process_document(img, src, tgt),
+                inputs=[doc_input, source_lang, target_lang],
+                outputs=[doc_output, doc_type, extracted_json, translated_json]
+            )
+        with gr.Tab("Speech Translation"):
+            with gr.Row():
+                with gr.Column():
+                    audio_input = gr.Audio(type="filepath", label="Record Speech")
+                    speech_source_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
+                                                   value="English", label="Source Language")
+                    speech_target_lang = gr.Dropdown(choices=list(LANGUAGE_CODES.keys()),
+                                                   value="Arabic", label="Target Language")
+                    translate_btn = gr.Button("Translate Speech")
+                with gr.Column():
+                    original_text = gr.Textbox(label="Original Speech")
+                    translated_text = gr.Textbox(label="Translated Text")
+            translate_btn.click(
+                fn=lambda audio, src, tgt: translate_speech(audio, src, tgt),
+                inputs=[audio_input, speech_source_lang, speech_target_lang],
+                outputs=[original_text, translated_text]
+            )
+        with gr.Tab("About"):
+            gr.Markdown("""
+            # Police Vision Translator MVP
+            This system demonstrates AI-powered translation capabilities for law enforcement:
+            - **Document Translation**: Identify and translate key fields in passports, IDs, and licenses
+            - **Speech Translation**: Real-time translation of conversations with civilians
+            ## Technologies Used
+            - Vision Transformers for document analysis
+            - NLLB-200 for translation between 200+ languages
+            - Whisper for multilingual speech recognition
+            - SpeechT5 for text-to-speech synthesis
+            Developed for demonstration at the World AI Expo Dubai.
+            """)
+    return app
+# Launch app
+# Build the UI and start serving it only when this file is run as a script;
+# importing the module does not launch the server.
+if __name__ == "__main__":
+    demo = create_ui()
+    demo.launch()