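# QwQ Edge: a multimodal Gradio chat Space. Plain text goes to a small causal LM
# (FastThink-0.5B-Tiny), image attachments go to a Qwen2-VL OCR model,
# "@image <prompt>" calls a hosted text-to-image endpoint, and "@tts1"/"@tts2"
# read the reply aloud with Edge TTS.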
import os
from threading import Thread
import gradio as gr
import spaces
import torch
import edge_tts
import asyncio
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from transformers.image_utils import load_image
from huggingface_hub import InferenceClient
import time
# Load text-only model and tokenizer
model_id = "prithivMLmods/FastThink-0.5B-Tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.eval()
# Load multimodal (OCR) model and processor
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_m = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda").eval()
TTS_VOICES = [
    "en-US-JennyNeural",  # @tts1
    "en-US-GuyNeural",    # @tts2
]
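# Messages prefixed with @tts1 or @tts2 pick one of these voices in generate().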
def image_gen(prompt):
    """Generate an image via the Inference API, with a FLUX.1-schnell fallback."""
    try:
        client = InferenceClient("prithivMLmods/STABLE-HAMSTER")
        return client.text_to_image(prompt)
    except Exception:
        client_flux = InferenceClient("black-forest-labs/FLUX.1-schnell")
        return client_flux.text_to_image(prompt)
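# Both endpoints return a PIL.Image via InferenceClient.text_to_image(); the
# FLUX.1-schnell call is a fallback for when the primary Space errors out.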
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    """Convert text to speech using Edge TTS and save as MP3."""
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file
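# edge_tts exposes only an async API; generate() bridges into it with asyncio.run().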
def clean_chat_history(chat_history):
    """Drop non-text entries (e.g. audio/image components) so the chat template only sees strings."""
    return [msg for msg in chat_history if isinstance(msg, dict) and isinstance(msg.get("content"), str)]
@spaces.GPU
def generate(input_dict: dict, chat_history: list[dict], max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2):
    """Generate chatbot responses with multimodal input, TTS, and image generation."""
    text = input_dict["text"]
    files = input_dict.get("files", [])
    images = [load_image(file) for file in files] if files else []
    voice = None
    if text.startswith("@tts"):
        voice_index = next((i for i in range(1, 3) if text.startswith(f"@tts{i}")), None)
        if voice_index:
            voice = TTS_VOICES[voice_index - 1]
            text = text.replace(f"@tts{voice_index}", "").strip()
            # TTS requests start a fresh conversation so the spoken reply stays self-contained.
            conversation = [{"role": "user", "content": text}]
        else:
            # Unrecognized @tts suffix: fall back to a normal text reply.
            conversation = clean_chat_history(chat_history) + [{"role": "user", "content": text}]
    elif text.startswith("@image"):
        query = text.replace("@image", "").strip()
        yield "Generating Image, Please wait..."
        image = image_gen(query)
        yield gr.Image(image)
        return  # image requests never reach the chat models below
    else:
        conversation = clean_chat_history(chat_history) + [{"role": "user", "content": text}]
    if images:
        # Vision path: interleave the images and the text in one Qwen2-VL chat message.
        messages = [{
            "role": "user",
            "content": [
                *[{"type": "image", "image": img} for img in images],
                {"type": "text", "text": text},
            ],
        }]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=model_m.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens})
        thread.start()
        buffer = ""
        for new_text in streamer:
            buffer += new_text.replace("<|im_end|>", "")
            yield buffer
    else:
        # Text path: run generation on a background thread and stream tokens out.
        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=model.generate, kwargs={
            "input_ids": input_ids,
            "streamer": streamer,
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "top_p": top_p,
            "top_k": top_k,
            "temperature": temperature,
            "num_beams": 1,
            "repetition_penalty": repetition_penalty,
        })
        thread.start()
        response = "".join(streamer)
        yield response

        if voice:
            output_file = asyncio.run(text_to_speech(response, voice))
            yield gr.Audio(output_file, autoplay=True)
demo = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=1024),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
    ],
    examples=[
        ["@tts1 Who is Nikola Tesla?"],
        [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
        ["@image futuristic city at sunset"],
        ["A train travels 60 kilometers per hour. How far will it travel in 5 hours?"],
    ],
    cache_examples=False,
    description="# QwQ Edge 💬",
    fill_height=True,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
)
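# With multimodal=True, the MultimodalTextbox submits {"text": ..., "files": [...]}
# dicts, which is the input_dict shape that generate() unpacks.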
if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)