Update app.py
app.py
CHANGED
```diff
@@ -1,8 +1,7 @@
 # police_vision_translator.py
 import gradio as gr
 from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor
-from transformers import
-from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
+from transformers import AutoImageProcessor, AutoModel, BlipProcessor, BlipForConditionalGeneration
 import torch
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
```
```diff
@@ -13,24 +12,19 @@ import cv2
 # Initialize models
 print("Loading models...")
 
-# 1. Vision Document Analysis model
-document_processor =
-document_model =
+# 1. Vision Document Analysis model - Use BLIP directly instead of VisionEncoderDecoderModel
+document_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+document_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
-# 2. OCR for text extraction -
-ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
+# 2. OCR for text extraction - Use pipeline instead of loading model directly
+ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
 
 # 3. Translation model
 translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
 translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
 
-# 4. Speech recognition
-speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-
-
-# 5. Text-to-speech - Use correct model type
-tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+# 4. Speech recognition
+speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-small")
 
 print("Models loaded!")
```
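Both replacement loaders are plain `transformers` pipelines, so they can be smoke-tested in isolation before being wired into the app. A minimal sketch, assuming hypothetical local files `sample_id.png` and `sample_audio.wav`:

```python
from transformers import pipeline
from PIL import Image

# Same pipelines as in the diff above.
ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Hypothetical inputs: any RGB image and any common audio file will do.
image = Image.open("sample_id.png").convert("RGB")
print(ocr_pipeline(image))                    # -> [{'generated_text': '...'}]
print(speech_recognizer("sample_audio.wav"))  # -> {'text': '...'}
```

One thing worth flagging: `trocr-base-handwritten` targets handwriting, while the checkpoint removed by this commit (`microsoft/trocr-base-printed`) targets machine print. If the documents are mostly printed, the printed checkpoint may remain the better fit.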
```diff
@@ -50,18 +44,10 @@ LANGUAGE_CODES = {
 
 def detect_document_type(image):
     """Detect what type of document is in the image"""
-    # Use
-    inputs = document_processor(images=image, return_tensors="pt")
+    # Use BLIP model to get general description
+    inputs = document_processor(images=image, text="What type of document is this?", return_tensors="pt")
     outputs = document_model.generate(**inputs, max_length=50)
-
-    # Convert output IDs to text
-    # Use proper tokenizer method depending on model structure
-    if hasattr(document_model, 'decoder') and hasattr(document_model.decoder, 'tokenizer'):
-        description = document_model.decoder.tokenizer.decode(outputs[0], skip_special_tokens=True)
-    else:
-        # Fallback for models without decoder.tokenizer attribute
-        tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-large")
-        description = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    description = document_processor.decode(outputs[0], skip_special_tokens=True)
 
     # Simple rule-based classification
     if "passport" in description.lower():
```
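The single-line decode works because `BlipProcessor.decode` forwards to the underlying tokenizer, so the removed `document_model.decoder.tokenizer` probing (an attribute BLIP models do not expose) is no longer needed. The hunk cuts off after the first classification branch; a sketch of how the rule-based mapping could continue, where every branch after the `passport` check is hypothetical:

```python
def classify_description(description: str) -> str:
    """Map a BLIP caption to a coarse document type (illustrative branches only)."""
    description = description.lower()
    if "passport" in description:
        return "Passport"
    elif "license" in description or "driving" in description:
        return "Driver's License"   # hypothetical branch
    elif "card" in description or "identity" in description:
        return "ID Card"            # hypothetical branch
    return "Unknown Document"       # hypothetical fallback
```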
```diff
@@ -83,10 +69,12 @@ def extract_text_from_regions(image, regions):
         region = img_array[y1:y2, x1:x2]
         region_pil = Image.fromarray(region)
 
-        # Process with OCR
+        # Process with OCR pipeline
+        result = ocr_pipeline(region_pil)
+        if result and len(result) > 0 and "generated_text" in result[0]:
+            text = result[0]["generated_text"]
+        else:
+            text = ""
 
         results[field_name] = text
```
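Given the `img_array[y1:y2, x1:x2]` indexing, each region is an `(x1, y1, x2, y2)` pixel box keyed by field name, and the function fills a dict of extracted strings. A usage sketch, assuming the function iterates `regions.items()`; the field names and coordinates below are made up:

```python
from PIL import Image

# Hypothetical field boxes for an ID photo: (x1, y1, x2, y2) in pixels.
regions = {
    "name": (40, 120, 420, 160),
    "date_of_birth": (40, 180, 300, 220),
}

image = Image.open("sample_id.png").convert("RGB")
fields = extract_text_from_regions(image, regions)
print(fields)  # -> {'name': '...', 'date_of_birth': '...'}
```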
```diff
@@ -260,10 +248,10 @@ def create_ui():
 
     ## Technologies Used
 
+    - BLIP for document analysis and classification
+    - TrOCR for text extraction from documents
     - NLLB-200 for translation between 200+ languages
    - Whisper for multilingual speech recognition
-    - SpeechT5 for text-to-speech synthesis
 
     Developed for demonstration at the World AI Expo Dubai.
     """)
```
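The NLLB model and tokenizer are loaded in an earlier hunk, but the translate call itself sits outside the changed lines. For reference, a typical NLLB-200 invocation against the `LANGUAGE_CODES` table named in the hunk header looks roughly like this; the `eng_Latn`/`ara_Arab` pair and the helper name are examples, not code from the app:

```python
def translate(text: str, src: str = "eng_Latn", tgt: str = "ara_Arab") -> str:
    """Sketch of an NLLB-200 call; src/tgt are FLORES-200 language codes."""
    translator_tokenizer.src_lang = src
    inputs = translator_tokenizer(text, return_tensors="pt")
    tokens = translator_model.generate(
        **inputs,
        # NLLB steers the output language by forcing the first generated token.
        forced_bos_token_id=translator_tokenizer.convert_tokens_to_ids(tgt),
        max_length=256,
    )
    return translator_tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
```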