preston-cell committed on
Commit efa273d · verified · 1 Parent(s): 602e80d

Update app.py

Files changed (1)
  1. app.py +26 -9
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline
+from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
 from datasets import load_dataset
 import torch
 import numpy as np
@@ -10,7 +10,13 @@ caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captionin
 # Load SpeechT5 model for text-to-speech
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
 
-# Load speaker embedding once
+# Load Florence-2 model for OCR
+ocr_device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ocr_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-large", torch_dtype=ocr_dtype, trust_remote_code=True).to(ocr_device)
+ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)
+
+# Load speaker embedding
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
@@ -26,15 +32,26 @@ def process_image(image):
             forward_params={"speaker_embeddings": speaker_embedding}
         )
 
+        # Extract text (OCR) using Florence-2
+        inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(ocr_device, ocr_dtype)
+        generated_ids = ocr_model.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=4096,
+            num_beams=3,
+            do_sample=False
+        )
+        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
         # Prepare audio data
         audio = np.array(speech["audio"])
         rate = speech["sampling_rate"]
 
-        # Return both audio and caption
-        return (rate, audio), caption
+        # Return audio, caption, and extracted text
+        return (rate, audio), caption, extracted_text
 
     except Exception as e:
-        return None, f"Error: {str(e)}"
+        return None, f"Error: {str(e)}", ""
 
 
 # Gradio Interface
@@ -43,11 +60,11 @@ iface = gr.Interface(
     inputs=gr.Image(type='pil', label="Upload an Image"),
     outputs=[
         gr.Audio(label="Generated Audio"),
-        gr.Textbox(label="Generated Caption")
+        gr.Textbox(label="Generated Caption"),
+        gr.Textbox(label="Extracted Text (OCR)")
     ],
-    title="SeeSay",
-    description="Upload an image to generate a caption and hear it described with SpeechT5's speech synthesis."
+    title="SeeSay with SpeechT5 and Florence-2 OCR",
+    description="Upload an image to generate a caption, hear it described with SpeechT5's speech synthesis, and extract text using Florence-2 OCR."
 )
 
 iface.launch()
-
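
A note on the new OCR path: the commit returns the raw `batch_decode` output, while Microsoft's published Florence-2 examples typically decode without skipping special tokens and then run the string through the processor's `post_process_generation` helper to strip task tokens. A minimal sketch of that variant, assuming the helper is available via this model's trust_remote_code processor as in those examples:

# Hypothetical variant (not in this commit), following Microsoft's Florence-2 example code:
# decode with special tokens intact, then let the processor parse the task output.
generated_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed = ocr_processor.post_process_generation(
    generated_text,
    task="<OCR>",
    image_size=(image.width, image.height)  # image is the PIL input to process_image
)
extracted_text = parsed["<OCR>"]  # the cleaned OCR string, keyed by task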
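
For a quick local check of the updated three-output contract, a hypothetical smoke test along these lines could work. It assumes `iface.launch()` is moved under an `if __name__ == "__main__":` guard (the commit calls it at import time, which would start the server on import) and that a test image exists at the path shown:

from PIL import Image
from app import process_image  # assumes app.py guards iface.launch() behind __main__

# Hypothetical smoke test; sample.jpg is any local test image.
image = Image.open("sample.jpg")
audio_out, caption, extracted_text = process_image(image)

if audio_out is not None:
    rate, audio = audio_out  # (sampling_rate, np.ndarray), the tuple format gr.Audio accepts
    print(f"audio: {audio.shape} samples @ {rate} Hz")
print("caption:", caption)
print("ocr:", extracted_text)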