preston-cell committed
Commit ffbd81c · verified · 1 Parent(s): 1bef913

Update app.py

Files changed (1)
  1. app.py +55 -35
app.py CHANGED
@@ -1,35 +1,38 @@
 import gradio as gr
 from transformers import (
     pipeline,
-    AutoTokenizer,
+    AutoProcessor,
     AutoModelForCausalLM,
+    AutoTokenizer,
     GenerationConfig,
-    set_seed
+    TextStreamer
 )
+from datasets import load_dataset
 import torch
 import numpy as np
-import pytesseract
-from PIL import Image
-from datasets import load_dataset
-
-set_seed(42)
 
-# Device
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# Set device and dtype
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-# Image Captioning (BLIP)
+# Image Captioning
 caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-# Text-to-Speech without speaker embeddings
+# Text-to-Speech
 synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
 
-# Doge-320M-Instruct for Context Generation
-doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
-doge_model = AutoModelForCausalLM.from_pretrained(
-    "SmallDoge/Doge-320M-Instruct", trust_remote_code=True
+# Florence-2-base for OCR
+ocr_model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Florence-2-base",
+    torch_dtype=torch_dtype,
+    trust_remote_code=True
 ).to(device)
+ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
 
-doge_generation_config = GenerationConfig(
+# Doge model for context generation
+doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
+doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
+doge_config = GenerationConfig(
     max_new_tokens=100,
     use_cache=True,
     do_sample=True,
@@ -38,22 +41,35 @@ doge_generation_config = GenerationConfig(
     repetition_penalty=1.0
 )
 
-def extract_text_with_tesseract(image):
-    return pytesseract.image_to_string(image)
+# Speaker embedding (600-dim)
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+embedding = None
+for entry in embeddings_dataset:
+    vector = torch.tensor(entry["xvector"]).unsqueeze(0)
+    if vector.shape[1] >= 600:
+        embedding = vector[:, :600]
+        break
+if embedding is None:
+    raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")
 
 def process_image(image):
     try:
-        # 1. Caption
+        # Caption
         caption = caption_model(image)[0]['generated_text']
 
-        # 2. OCR
-        extracted_text = extract_text_with_tesseract(image)
-
-        # 3. Context with Doge (truncate input)
-        prompt = (
-            f"Determine the context of this image.\n"
-            f"Caption: {caption[:200]}\nExtracted text: {extracted_text[:200]}\nContext:"
+        # OCR
+        ocr_inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
+        generated_ids = ocr_model.generate(
+            input_ids=ocr_inputs["input_ids"],
+            pixel_values=ocr_inputs["pixel_values"],
+            max_new_tokens=1024,
+            num_beams=3,
+            do_sample=False
        )
+        extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # Doge context generation
+        prompt = f"Determine the context of this image. Caption: {caption} Extracted text: {extracted_text}"
         conversation = [{"role": "user", "content": prompt}]
         doge_inputs = doge_tokenizer.apply_chat_template(
             conversation=conversation,
@@ -61,14 +77,17 @@ def process_image(image):
             return_tensors="pt"
         ).to(device)
 
-        doge_output = doge_model.generate(
-            input_ids=doge_inputs,
-            generation_config=doge_generation_config
+        outputs = doge_model.generate(
+            doge_inputs,
+            generation_config=doge_config
         )
-        context = doge_tokenizer.decode(doge_output[0], skip_special_tokens=True).strip()
+        context = doge_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # 4. Text-to-Speech (no embeddings)
-        speech = synthesiser(context)
+        # TTS
+        speech = synthesiser(
+            context,
+            forward_params={"speaker_embeddings": embedding}
+        )
         audio = np.array(speech["audio"])
         rate = speech["sampling_rate"]
 
@@ -77,6 +96,7 @@ def process_image(image):
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
 
+# Gradio Interface
 iface = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type='pil', label="Upload an Image"),
@@ -86,8 +106,8 @@ iface = gr.Interface(
         gr.Textbox(label="Extracted Text (OCR)"),
         gr.Textbox(label="Generated Context")
     ],
-    title="SeeSay Contextualizer (Optimized)",
-    description="Upload an image to generate a caption, extract text (OCR), generate context, and hear it spoken."
+    title="SeeSay Contextualizer with Doge & BLIP",
+    description="Upload an image to generate a caption, extract text, determine context, and convert it to audio."
 )
 
-iface.launch(share=True)
+iface.launch()
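
For reference, here is a minimal standalone sketch (not part of this commit) of how the Florence-2 "<OCR>" task prompt and a SpeechT5 speaker embedding are typically wired together with transformers and datasets. The image path and the xvector index are placeholder assumptions; note that the CMU Arctic xvectors are 512-dimensional, which is the speaker-embedding size SpeechT5 uses.

# Standalone usage sketch. Assumptions: a local "example.jpg" and xvector index
# 7306 from the validation split; the model checkpoints mirror the ones above.
import torch
from PIL import Image
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Florence-2 OCR: the "<OCR>" prompt asks the model to transcribe text in the image.
ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
ocr_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base", torch_dtype=dtype, trust_remote_code=True
).to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder input image
inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, dtype)
generated_ids = ocr_model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3,
)
ocr_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# SpeechT5 TTS: pass a (1, 512) xvector as the speaker embedding.
tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")
xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)  # shape (1, 512)
speech = tts(ocr_text, forward_params={"speaker_embeddings": speaker_embedding})
print(speech["sampling_rate"], speech["audio"].shape)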