import gradio as gr from transformers import AutoModel, AutoTokenizer import torch import torchvision.transforms as T from torchvision.transforms.functional import InterpolationMode from Models.modeling_llavaqw import LlavaQwModel IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) model_name = "torettomarui/Llava-qw" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) model = LlavaQwModel.from_pretrained( model_name, torch_dtype=torch.bfloat16, trust_remote_code=True, ).to(torch.bfloat16).eval().cuda() def build_transform(input_size): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD transform = T.Compose([ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD) ]) return transform def preprocess_image(file_path, image_size=448): transform = build_transform(image_size) pixel_values = transform(file_path) return torch.stack([pixel_values]).to(torch.bfloat16).cuda() def generate_response(image, text): pixel_values = preprocess_image(image) generation_config = dict(max_new_tokens=2048, do_sample=False) question = '\n' + text response = model.chat(tokenizer, pixel_values, question, generation_config) return response # 添加示例图像和文本 examples = [ ["./text.png", "图中的文字是什么?"], ] iface = gr.Interface( fn=generate_response, inputs=[ gr.Image(type="pil", label="上传图片"), gr.Textbox(lines=2, placeholder="输入你的问题..."), ], outputs="text", title="Llava-QW", description="上传一张图片并输入你的问题,模型将生成相应的回答。", examples=examples # 添加示例 ) iface.launch()