import gradio as gr
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from threading import Thread
from qwen_vl_utils import process_vision_info
import torch

# Local cache path for the model weights
local_path = "Fancy-MLLM/R1-OneVision/R1-OneVision/R1-OneVison-7B"

# Load the model on CPU first, then move it to the GPU for inference
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    local_path, torch_dtype="auto", device_map="cpu"
)
model.cuda().eval()
processor = AutoProcessor.from_pretrained(local_path)


# Process the image and question, then stream the generated answer
def generate_output(image, text):
    # Build the chat message containing the image and the user's question
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image, "min_pixels": 1003520, "max_pixels": 12845056},
                {"type": "text", "text": text},
            ],
        }
    ]

    # Apply the chat template and prepare the model inputs
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Run generation in a background thread and stream tokens as they arrive
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=4096,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_text = ""
    try:
        for new_text in streamer:
            generated_text += new_text
            # Yield the accumulated text so the Markdown output updates incrementally
            yield generated_text
    except Exception as e:
        print(f"Error: {e}")
        yield f"Error occurred: {str(e)}"


CSS = """
#output-markdown {
    overflow-y: auto;
    white-space: pre-wrap;
    word-wrap: break-word;
}

#output-markdown .math {
    overflow-x: auto;
    max-width: 100%;
}

.markdown-text {
    white-space: pre-wrap;
    word-wrap: break-word;
}

#qwen-md .katex-display {
    display: inline;
}

#qwen-md .katex-display > .katex {
    display: inline;
}

#qwen-md .katex-display > .katex > .katex-html {
    display: inline;
}
"""

with gr.Blocks(css=CSS) as demo:
    gr.HTML("""<h1 style="text-align: center;">🦖 R1-Onevision Demo</h1>""")
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload")
            input_text = gr.Textbox(label="Input your question")
            with gr.Row():
                with gr.Column():
                    clear_btn = gr.ClearButton([input_image, input_text])
                with gr.Column():
                    submit_btn = gr.Button("Submit", variant="primary")

            gr.Examples(
                examples=[
                    ["20250208-205626.jpeg", "How many plums (see the picture) weigh as much as an apple?"],
                    ["38.jpg", "Each of the digits 2, 3, 4 and 5 will be placed in a square. Then there will be two numbers, which will be added together. What is the biggest number that they could make?"],
                    ["64.jpg", "Four of the numbers 1, 3, 4, 5 and 7 are written into the boxes so that the calculation is correct.\nWhich number was not used?"],
                ],
                inputs=[input_image, input_text],
                label="Example Inputs",
            )
        with gr.Column():
            output_text = gr.Markdown(
                label="Generated Response",
                max_height="80vh",
                min_height="50vh",
                container=True,
                latex_delimiters=[
                    {"left": "\\(", "right": "\\)", "display": True},
                    {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
                    {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
                    {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
                    {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
                    {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
                    {"left": "\\[", "right": "\\]", "display": True},
                ],
                elem_id="qwen-md",
            )

    submit_btn.click(
        fn=generate_output,
        inputs=[input_image, input_text],
        outputs=output_text,
        queue=True,
    )

demo.launch(share=True)