Spaces:

xzerus
/

internvl2.5

Running

App Files Files Community

xzerus committed on Dec 21, 2024

Commit

895c285

verified ·

1 Parent(s): 11bbd27

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -70

app.py CHANGED Viewed

@@ -1,85 +1,104 @@
-import numpy as np
 import torch
 import torchvision.transforms as T
-from decord import VideoReader, cpu
 from PIL import Image
-from torchvision.transforms.functional import InterpolationMode
-from transformers import AutoModel, AutoTokenizer
-from fastapi import FastAPI, UploadFile, File
-from typing import List
-from io import BytesIO
-# FastAPI app initialization
-app = FastAPI()
-# Device Configuration
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 def build_transform(input_size):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     transform = T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
         T.ToTensor(),
-        T.Normalize(mean=MEAN, std=STD)
     ])
     return transform
-def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
-    orig_width, orig_height = image.size
-    aspect_ratio = orig_width / orig_height
-    target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
-    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-    target_width = image_size * target_ratios[0][0]
-    target_height = image_size * target_ratios[0][1]
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(target_ratios[0][0] * target_ratios[0][1]):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
-        )
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-    return processed_images
-def load_image(image_file: BytesIO, input_size=448, max_num=12):
-    image = Image.open(image_file).convert('RGB')
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
-    pixel_values = [transform(image) for image in images]
-    pixel_values = torch.stack(pixel_values).to(device)
-    return pixel_values
-# Load Model
-path = 'OpenGVLab/InternVL2_5-1B'
 model = AutoModel.from_pretrained(
-    path,
-    low_cpu_mem_usage=True,
-    use_flash_attn=False,
-    trust_remote_code=True
-).eval().to(device)
-tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
-@app.post("/predict")
-async def predict(file: UploadFile = File(...), question: str = "Describe the image"):
-    # Load and preprocess the image
-    file_bytes = BytesIO(await file.read())
-    pixel_values = load_image(file_bytes)
-    # Generate a response
-    generation_config = dict(max_new_tokens=1024, do_sample=True)
-    response, _ = model.chat(tokenizer, pixel_values, question, generation_config)
-    return {"question": question, "response": response}

 import torch
 import torchvision.transforms as T
 from PIL import Image
+from threading import Thread
+from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer
+import gradio as gr
+import logging
+# Setup logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+# ImageNet normalization values
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 def build_transform(input_size):
+    """
+    Build the preprocessing pipeline applied to each input image.
+
+    The pipeline converts non-RGB images to RGB, resizes them to a square of
+    ``input_size`` x ``input_size`` pixels using bicubic interpolation,
+    converts the result to a tensor, and normalizes it with the module-level
+    ``IMAGENET_MEAN`` / ``IMAGENET_STD`` constants.
+
+    Args:
+        input_size: Side length, in pixels, of the square the image is resized to.
+
+    Returns:
+        The composed ``T.Compose`` transform.
+    """
     transform = T.Compose([
+        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+        T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC),
         T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
     ])
     return transform
+def preprocess_image(image, input_size=448):
+    """
+    Preprocess the image to the required format.
+    """
+    logging.info("Starting image preprocessing...")
+    transform = build_transform(input_size)
+    tensor_image = transform(image).unsqueeze(0)  # Add batch dimension
+    logging.info(f"Image preprocessed. Shape: {tensor_image.shape}")
+    return tensor_image
+# Load the model and tokenizer
+# Both are loaded once, at import time, and shared by every request.
+logging.info("Loading model from Hugging Face Hub...")
+model_path = "OpenGVLab/InternVL2_5-1B"  # Use Hugging Face model path
+# trust_remote_code=True runs the modeling code shipped in the model repo;
+# weights are loaded as bfloat16 and the model is switched to eval mode.
+# NOTE(review): the model is never moved to a device here, so it stays
+# wherever from_pretrained places it (presumably CPU) - confirm this is intended.
 model = AutoModel.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+).eval()
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
+# Add the `<image>` token if missing
+# NOTE(review): it is not visible from this file whether the remote-code
+# model needs `<image>` as a real vocabulary entry or implements
+# resize_token_embeddings - confirm against the model's own code.
+if "<image>" not in tokenizer.get_vocab():
+    tokenizer.add_tokens(["<image>"])
+    logging.info("Added `<image>` token to tokenizer vocabulary.")
+    model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings
+# Sanity check only: `assert` statements are skipped when Python runs with -O.
+assert "<image>" in tokenizer.get_vocab(), "Error: `<image>` token is missing from tokenizer vocabulary."
+def describe_image(image):
+    """
+    Generate a description for the uploaded image with streamed output.
+    """
+    try:
+        logging.info("Processing uploaded image...")
+        pixel_values = preprocess_image(image, input_size=448).to(torch.bfloat16)
+        prompt = "<image>\nExtract text from the image, respond with only the extracted text."
+        logging.info(f"Prompt: {prompt}")
+        # Streamer for live text output
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
+        generation_config = dict(max_new_tokens=512, do_sample=True, streamer=streamer)
+        logging.info("Starting model inference...")
+        thread = Thread(target=model.chat, kwargs=dict(
+            tokenizer=tokenizer, pixel_values=pixel_values, question=prompt,
+            history=None, return_history=False, generation_config=generation_config,
+        ))
+        thread.start()
+        generated_text = ''
+        for new_text in streamer:
+            if new_text == model.conv_template.sep:
+                break
+            generated_text += new_text
+            yield new_text  # Stream each chunk
+        logging.info("Inference complete.")
+    except Exception as e:
+        logging.error(f"Error during processing: {e}")
+        yield f"Error: {e}"
+# Gradio Interface
+# Single image input, single read-only textbox output. Streaming of partial
+# results comes from `describe_image` being a generator function.
+logging.info("Setting up Gradio interface...")
+interface = gr.Interface(
+    fn=describe_image,
+    inputs=gr.Image(type="pil"),
+    outputs=gr.Textbox(label="Extracted Text", lines=10, interactive=False),
+    title="Image to Text",
+    description="Upload an image to extract text using the pretrained model.",
+    live=True,  # Re-run automatically whenever the input changes (no submit click)
+)
+if __name__ == "__main__":
+    logging.info("Launching Gradio interface...")
+    # Listen on all network interfaces, port 7860.
+    interface.launch(server_name="0.0.0.0", server_port=7860)