Nayera-2025 committed · Commit e57e063 · verified · 1 Parent(s): 964e62b

Update app.py

Files changed (1)
  1. app.py +19 -31
app.py CHANGED
@@ -1,8 +1,7 @@
  # police_vision_translator.py
  import gradio as gr
  from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor
- from transformers import ViTImageProcessor, VisionEncoderDecoderModel
- from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
+ from transformers import AutoImageProcessor, AutoModel, BlipProcessor, BlipForConditionalGeneration
  import torch
  import numpy as np
  from PIL import Image, ImageDraw, ImageFont
@@ -13,24 +12,19 @@ import cv2
  # Initialize models
  print("Loading models...")

- # 1. Vision Document Analysis model
- document_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
- document_model = VisionEncoderDecoderModel.from_pretrained("Salesforce/blip-image-captioning-large")
+ # 1. Vision Document Analysis model - Use BLIP directly instead of VisionEncoderDecoderModel
+ document_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ document_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

- # 2. OCR for text extraction - FIX: Use correct model class for TrOCR
- ocr_processor = AutoProcessor.from_pretrained("microsoft/trocr-base-printed")
- ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
+ # 2. OCR for text extraction - Use pipeline instead of loading model directly
+ ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")

  # 3. Translation model
  translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
  translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

- # 4. Speech recognition - Use pipeline which handles model loading correctly
- speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-medium")
-
- # 5. Text-to-speech - Use correct model type
- tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+ # 4. Speech recognition
+ speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-small")

  print("Models loaded!")

@@ -50,18 +44,10 @@ LANGUAGE_CODES = {

  def detect_document_type(image):
      """Detect what type of document is in the image"""
-     # Use vision model to get general description
-     inputs = document_processor(images=image, return_tensors="pt")
+     # Use BLIP model to get general description
+     inputs = document_processor(images=image, text="What type of document is this?", return_tensors="pt")
      outputs = document_model.generate(**inputs, max_length=50)
-
-     # Convert output IDs to text
-     # Use proper tokenizer method depending on model structure
-     if hasattr(document_model, 'decoder') and hasattr(document_model.decoder, 'tokenizer'):
-         description = document_model.decoder.tokenizer.decode(outputs[0], skip_special_tokens=True)
-     else:
-         # Fallback for models without decoder.tokenizer attribute
-         tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-large")
-         description = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     description = document_processor.decode(outputs[0], skip_special_tokens=True)

      # Simple rule-based classification
      if "passport" in description.lower():
@@ -83,10 +69,12 @@ def extract_text_from_regions(image, regions):
          region = img_array[y1:y2, x1:x2]
          region_pil = Image.fromarray(region)

-         # Process with OCR
-         inputs = ocr_processor(images=region_pil, return_tensors="pt")
-         generated_ids = ocr_model.generate(inputs["pixel_values"])
-         text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         # Process with OCR pipeline
+         result = ocr_pipeline(region_pil)
+         if result and len(result) > 0 and "generated_text" in result[0]:
+             text = result[0]["generated_text"]
+         else:
+             text = ""

          results[field_name] = text

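For reference, a small sketch of the raw output shape the guard above relies on; the crop filename is hypothetical.

# Sketch (not part of the commit): the image-to-text pipeline returns a list of dicts,
# e.g. [{"generated_text": "JOHN DOE"}], hence the "generated_text" check above.
from PIL import Image
crop = Image.open("name_field_crop.png")  # hypothetical cropped region
ocr_result = ocr_pipeline(crop)
print(ocr_result[0]["generated_text"] if ocr_result else "")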
@@ -260,10 +248,10 @@ def create_ui():

      ## Technologies Used

-     - Vision Transformers for document analysis
+     - BLIP for document analysis and classification
+     - TrOCR for text extraction from documents
      - NLLB-200 for translation between 200+ languages
      - Whisper for multilingual speech recognition
-     - SpeechT5 for text-to-speech synthesis

      Developed for demonstration at the World AI Expo Dubai.
      """)