preston-cell committed · Commit 68bf04e · verified · 1 Parent(s): 66d96fc

Update app.py

Files changed (1):
  1. app.py  +42 -38
app.py CHANGED
@@ -1,77 +1,82 @@
  import gradio as gr
- import torch
- import numpy as np
- import requests
- import io
- from PIL import Image
  from transformers import (
      pipeline,
      AutoProcessor,
      AutoModelForCausalLM,
      AutoTokenizer,
      GenerationConfig,
+     TextStreamer,
+     set_seed
  )
+ import torch
+ import numpy as np
+ import requests
+ import io
  from datasets import load_dataset

- # Set device and dtype
+ # Set device
  device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ set_seed(42)
+
+ # Image Captioning (BLIP)
+ caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=0 if torch.cuda.is_available() else -1)

- # Load BLIP for image captioning
- caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+ # Text-to-Speech (SpeechT5)
+ synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts", device=0 if torch.cuda.is_available() else -1)

- # Load Florence-2-base for OCR
- ocr_model = AutoModelForCausalLM.from_pretrained(
-     "microsoft/Florence-2-base", trust_remote_code=True, torch_dtype=dtype
- ).to(device)
+ # OCR using Florence-2-base
+ ocr_model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
  ocr_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)

  # Load SmallDoge for context generation
  doge_tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M-Instruct")
  doge_model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-320M-Instruct", trust_remote_code=True).to(device)
-
  doge_config = GenerationConfig(
      max_new_tokens=100,
+     use_cache=True,
      do_sample=True,
      temperature=0.8,
      top_p=0.9,
-     repetition_penalty=1.0,
+     repetition_penalty=1.0
  )

- # Load SpeechT5 for TTS
- synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-
- # Load speaker embedding from .npy using BytesIO
+ # Load speaker embedding (600-dim)
  SPEAKER_EMBEDDING_URL = "https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors/resolve/main/spkemb/fn0012.npy"
  response = requests.get(SPEAKER_EMBEDDING_URL)
  buffer = io.BytesIO(response.content)
- speaker_embedding = torch.tensor(np.load(buffer)).unsqueeze(0)  # Shape: [1, 600]
-
+ speaker_embedding = torch.tensor(np.load(buffer, allow_pickle=True)).unsqueeze(0)  # Shape: [1, 600]
+ if speaker_embedding.shape[1] < 600:
+     raise ValueError("No suitable speaker embedding of at least 600 dimensions found.")

+ # Main function
  def process_image(image):
      try:
-         # Captioning
-         caption = caption_model(image)[0]["generated_text"]
+         # Generate caption
+         caption = caption_model(image)[0]['generated_text']

-         # OCR
-         inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, dtype)
+         # OCR extraction
+         inputs = ocr_processor(text="<OCR>", images=image, return_tensors="pt").to(device, torch_dtype)
          generated_ids = ocr_model.generate(
              input_ids=inputs["input_ids"],
              pixel_values=inputs["pixel_values"],
-             max_new_tokens=1024,
-             num_beams=3,
+             max_new_tokens=512,
              do_sample=False,
+             num_beams=3
          )
          extracted_text = ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

-         # Context generation with Doge
+         # Generate context using SmallDoge
          prompt = f"Determine the context of this image based on the caption and extracted text.\nCaption: {caption}\nExtracted text: {extracted_text}\nContext:"
          conversation = [{"role": "user", "content": prompt}]
-         inputs = doge_tokenizer.apply_chat_template(conversation, tokenize=True, return_tensors="pt").to(device)
-         output_ids = doge_model.generate(inputs, generation_config=doge_config)
-         context = doge_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+         inputs = doge_tokenizer.apply_chat_template(
+             conversation=conversation, tokenize=True, return_tensors="pt"
+         ).to(device)
+
+         outputs = doge_model.generate(inputs, generation_config=doge_config)
+         context = doge_tokenizer.decode(outputs[0], skip_special_tokens=True)

-         # TTS
+         # Text-to-Speech
          speech = synthesiser(
              context,
              forward_params={"speaker_embeddings": speaker_embedding}
@@ -84,19 +89,18 @@ def process_image(image):
      except Exception as e:
          return None, f"Error: {str(e)}", "", ""

-
- # Gradio interface
+ # Gradio UI
  iface = gr.Interface(
      fn=process_image,
-     inputs=gr.Image(type="pil", label="Upload an Image"),
+     inputs=gr.Image(type='pil', label="Upload an Image"),
      outputs=[
          gr.Audio(label="Generated Audio"),
          gr.Textbox(label="Generated Caption"),
          gr.Textbox(label="Extracted Text (OCR)"),
-         gr.Textbox(label="Generated Context"),
+         gr.Textbox(label="Generated Context")
      ],
      title="SeeSay Contextualizer",
-     description="Upload an image to generate a caption, extract text (OCR), generate context using Doge, and convert to audio with SpeechT5.",
+     description="Upload an image to generate a caption, extract text with Florence-2-base, contextualize with Doge-320M-Instruct, and hear it with SpeechT5."
  )

- iface.launch()
+ iface.launch(share=True)
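
A note on the new speaker-embedding guard: the x-vectors in Matthijs/cmu-arctic-xvectors are 512-dimensional, which is also the speaker-embedding size SpeechT5 expects, so a check requiring at least 600 dimensions would reject a valid embedding. A minimal inspection sketch, assuming the .npy file holds a single x-vector (not part of this commit):

import io

import numpy as np
import requests
import torch

SPEAKER_EMBEDDING_URL = "https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors/resolve/main/spkemb/fn0012.npy"

# Download the raw .npy and inspect its shape before wiring it into the TTS call;
# SpeechT5 expects speaker embeddings of shape [1, 512].
buffer = io.BytesIO(requests.get(SPEAKER_EMBEDDING_URL, timeout=30).content)
speaker_embedding = torch.tensor(np.load(buffer)).unsqueeze(0)
print(speaker_embedding.shape)  # expected: torch.Size([1, 512])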
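
Also worth noting: doge_model.generate returns the prompt tokens followed by the completion, so doge_tokenizer.decode(outputs[0], ...) yields the instruction text as well, and that full string is what reaches the TTS step. If only the generated context should be spoken, a common pattern is to slice off the prompt first; a small sketch (the helper name is hypothetical, not part of this commit):

def decode_generated_only(tokenizer, prompt_ids, output_ids):
    # prompt_ids: the [1, prompt_len] tensor returned by apply_chat_template;
    # output_ids: the [1, prompt_len + new_tokens] tensor from model.generate.
    # Slicing at the prompt length keeps only the newly generated tokens.
    return tokenizer.decode(output_ids[0][prompt_ids.shape[-1]:], skip_special_tokens=True).strip()

# e.g. context = decode_generated_only(doge_tokenizer, inputs, outputs)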
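
Finally, the lines between the two hunks (old 78-83 / new 83-88) are unchanged and not shown; they close the synthesiser(...) call and return the audio together with the text outputs, in the order the except clause and the gr.Audio/gr.Textbox outputs imply. A sketch of that hand-off, assuming the app converts the pipeline result into the tuple gr.Audio accepts (the helper name is hypothetical):

import numpy as np

def to_gradio_audio(speech):
    # The transformers "text-to-speech" pipeline returns a dict with an "audio"
    # waveform (NumPy array) and an integer "sampling_rate"; gr.Audio takes a
    # (sampling_rate, waveform) tuple built from those two fields.
    return speech["sampling_rate"], np.asarray(speech["audio"])

# e.g. inside process_image: return to_gradio_audio(speech), caption, extracted_text, context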