Nayera-2025 committed · Commit e57e063 · verified · 1 Parent(s): 964e62b

Update app.py

Files changed (1)
  1. app.py +19 -31
app.py CHANGED
@@ -1,8 +1,7 @@
  # police_vision_translator.py
  import gradio as gr
  from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoProcessor
- from transformers import ViTImageProcessor, VisionEncoderDecoderModel
- from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
+ from transformers import AutoImageProcessor, AutoModel, BlipProcessor, BlipForConditionalGeneration
  import torch
  import numpy as np
  from PIL import Image, ImageDraw, ImageFont
@@ -13,24 +12,19 @@ import cv2
  # Initialize models
  print("Loading models...")

- # 1. Vision Document Analysis model
- document_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
- document_model = VisionEncoderDecoderModel.from_pretrained("Salesforce/blip-image-captioning-large")
+ # 1. Vision Document Analysis model - Use BLIP directly instead of VisionEncoderDecoderModel
+ document_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ document_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

- # 2. OCR for text extraction - FIX: Use correct model class for TrOCR
- ocr_processor = AutoProcessor.from_pretrained("microsoft/trocr-base-printed")
- ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
+ # 2. OCR for text extraction - Use pipeline instead of loading model directly
+ ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")

  # 3. Translation model
  translator_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
  translator_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

- # 4. Speech recognition - Use pipeline which handles model loading correctly
- speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-medium")
-
- # 5. Text-to-speech - Use correct model type
- tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+ # 4. Speech recognition
+ speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-small")

  print("Models loaded!")

@@ -50,18 +44,10 @@ LANGUAGE_CODES = {

  def detect_document_type(image):
      """Detect what type of document is in the image"""
-     # Use vision model to get general description
-     inputs = document_processor(images=image, return_tensors="pt")
+     # Use BLIP model to get general description
+     inputs = document_processor(images=image, text="What type of document is this?", return_tensors="pt")
      outputs = document_model.generate(**inputs, max_length=50)
-
-     # Convert output IDs to text
-     # Use proper tokenizer method depending on model structure
-     if hasattr(document_model, 'decoder') and hasattr(document_model.decoder, 'tokenizer'):
-         description = document_model.decoder.tokenizer.decode(outputs[0], skip_special_tokens=True)
-     else:
-         # Fallback for models without decoder.tokenizer attribute
-         tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-large")
-         description = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     description = document_processor.decode(outputs[0], skip_special_tokens=True)

      # Simple rule-based classification
      if "passport" in description.lower():
@@ -83,10 +69,12 @@ def extract_text_from_regions(image, regions):
          region = img_array[y1:y2, x1:x2]
          region_pil = Image.fromarray(region)

-         # Process with OCR
-         inputs = ocr_processor(images=region_pil, return_tensors="pt")
-         generated_ids = ocr_model.generate(inputs["pixel_values"])
-         text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         # Process with OCR pipeline
+         result = ocr_pipeline(region_pil)
+         if result and len(result) > 0 and "generated_text" in result[0]:
+             text = result[0]["generated_text"]
+         else:
+             text = ""

          results[field_name] = text

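For reference, a small sketch of the raw output shape the guard above relies on; the crop filename is hypothetical.

# Sketch (not part of the commit): the image-to-text pipeline returns a list of dicts,
# e.g. [{"generated_text": "JOHN DOE"}], hence the "generated_text" check above.
from PIL import Image
crop = Image.open("name_field_crop.png")  # hypothetical cropped region
ocr_result = ocr_pipeline(crop)
print(ocr_result[0]["generated_text"] if ocr_result else "")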
@@ -260,10 +248,10 @@ def create_ui():

      ## Technologies Used

-     - Vision Transformers for document analysis
+     - BLIP for document analysis and classification
+     - TrOCR for text extraction from documents
      - NLLB-200 for translation between 200+ languages
      - Whisper for multilingual speech recognition
-     - SpeechT5 for text-to-speech synthesis

      Developed for demonstration at the World AI Expo Dubai.
      """)