nanoLLaVA

Runtime error

App Files Files Community

qnguyen3 committed 28 days ago

Commit

0159119

verified ·

1 Parent(s): 50bd3d6

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -131

app.py CHANGED Viewed

@@ -6,17 +6,21 @@ from threading import Thread
 import re
 import time
 from PIL import Image
 import spaces
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-# Initialize tokenizer (doesn't require CUDA)
 tokenizer = AutoTokenizer.from_pretrained(
     'qnguyen3/nanoLLaVA-1.5',
     trust_remote_code=True)
-# Don't initialize model here - move it to the GPU-decorated function
-model = None
 class KeywordsStoppingCriteria(StoppingCriteria):
     def __init__(self, keywords, tokenizer, input_ids):
@@ -55,154 +59,69 @@ class KeywordsStoppingCriteria(StoppingCriteria):
 @spaces.GPU
 def bot_streaming(message, history):
-    global model
-    # Initialize the model inside the GPU-decorated function
-    if model is None:
-        model = LlavaQwen2ForCausalLM.from_pretrained(
-            'qnguyen3/nanoLLaVA-1.5',
-            torch_dtype=torch.float16,
-            attn_implementation="flash_attention_2",
-            trust_remote_code=True,
-            device_map="auto")  # Use "auto" instead of 'cpu' then manual to('cuda')
-    # Get image path
-    image = None
-    if "files" in message and message["files"]:
-        image = message["files"][-1]["path"]
-    # Check if image is available
-    if image is None:
-        return "Please upload an image for LLaVA to work."
-    # Prepare conversation messages
     messages = []
-    if len(history) > 0:
-        for human, assistant in history:
-            # Skip None responses (which can happen during streaming)
-            if assistant is not None:
-                messages.append({"role": "user", "content": human})
-                messages.append({"role": "assistant", "content": assistant})
-        # Add the current message
-        messages.append({"role": "user", "content": f"<image>\n{message['text']}" if len(messages) == 0 else message['text']})
     else:
         messages.append({"role": "user", "content": f"<image>\n{message['text']}"})
-    # Process image
     image = Image.open(image).convert("RGB")
-    # Prepare input for generation
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True)
-    # Handle image embedding in text
-    if '<image>' in text:
-        text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-        input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
-    else:
-        # If no <image> tag was added (possible in some chat templates), add it manually
-        input_ids = tokenizer(text).input_ids
-        # Find the position to insert the image token
-        # For simplicity, insert after the user message start
-        user_start_pos = 0
-        for i, token in enumerate(input_ids):
-            if tokenizer.decode([token]) == '<|im_start|>user':
-                user_start_pos = i + 2  # +2 to get past the tag
-                break
-        # Insert image token
-        input_ids = input_ids[:user_start_pos] + [-200] + input_ids[user_start_pos:]
-        input_ids = torch.tensor([input_ids], dtype=torch.long)
-    # Prepare stopping criteria
     stop_str = '<|im_end|>'
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    # Process image and generate text
     image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
-    generation_kwargs = dict(
-        input_ids=input_ids,
-        images=image_tensor,
-        streamer=streamer,
-        max_new_tokens=512,
-        stopping_criteria=[stopping_criteria],
-        temperature=0.01
-    )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    # Stream response
     buffer = ""
     for new_text in streamer:
-        buffer += new_text
-        generated_text_without_prompt = buffer[:]
-        time.sleep(0.04)
-        yield generated_text_without_prompt
-# Create a gradio Blocks interface instead of ChatInterface
-# This avoids the schema validation issues
-with gr.Blocks(title="🚀nanoLLaVA-1.5") as demo:
-    gr.Markdown("## 🚀nanoLLaVA-1.5")
-    gr.Markdown("Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.")
-    chatbot = gr.Chatbot(height=500)
-    with gr.Row():
-        with gr.Column(scale=0.8):
-            msg = gr.Textbox(
-                show_label=False,
-                placeholder="Enter text and upload an image",
-                container=False
-            )
-        with gr.Column(scale=0.2):
-            btn = gr.Button("Submit")
-            stop_btn = gr.Button("Stop Generation")
-    upload_btn = gr.UploadButton("Upload Image", file_types=["image"])
-    current_img = gr.State(None)
-    # Example images
-    examples = gr.Examples(
-        examples=[
-            ["Who is this guy?", "./demo_1.jpg"],
-            ["What does the text say?", "./demo_2.jpeg"]
-        ],
-        inputs=[msg, upload_btn]
-    )
-    def upload_image(image):
-        return image
-    def add_text(history, text, image):
-        if image is None and (not history or type(history[0][0]) != tuple):
-            return history + [[text, "Please upload an image first."]]
-        return history + [[text, None]]
-    def bot_response(history, image):
-        message = {"text": history[-1][0], "files": [{"path": image}] if image else []}
-        history_format = history[:-1]  # All except the last message
-        response = ""
-        for chunk in bot_streaming(message, history_format):
-            response = chunk
-            history[-1][1] = response
-            yield history
-    upload_btn.upload(upload_image, upload_btn, current_img)
-    msg.submit(add_text, [chatbot, msg, current_img], chatbot).then(
-        bot_response, [chatbot, current_img], chatbot
-    )
-    btn.click(add_text, [chatbot, msg, current_img], chatbot).then(
-        bot_response, [chatbot, current_img], chatbot
-    )
-    stop_btn.click(None, None, None, cancels=[bot_response])
-# Launch the app with queuing
 demo.queue().launch()

 import re
 import time
 from PIL import Image
+import torch
 import spaces
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 tokenizer = AutoTokenizer.from_pretrained(
     'qnguyen3/nanoLLaVA-1.5',
     trust_remote_code=True)
+model = LlavaQwen2ForCausalLM.from_pretrained(
+    'qnguyen3/nanoLLaVA-1.5',
+    torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2",
+    trust_remote_code=True,
+    device_map='auto')
 class KeywordsStoppingCriteria(StoppingCriteria):
     def __init__(self, keywords, tokenizer, input_ids):
 @spaces.GPU
 def bot_streaming(message, history):
     messages = []
+    if message["files"]:
+      image = message["files"][-1]["path"]
     else:
+      for i, hist in enumerate(history):
+        if type(hist[0])==tuple:
+          image = hist[0][0]
+          image_turn = i
+    if len(history) > 0 and image is not None:
+        messages.append({"role": "user", "content": f'<image>\n{history[1][0]}'})
+        messages.append({"role": "assistant", "content": history[1][1] })
+        for human, assistant in history[2:]:
+            messages.append({"role": "user", "content": human })
+            messages.append({"role": "assistant", "content": assistant })
+        messages.append({"role": "user", "content": message['text']})
+    elif len(history) > 0 and image is None:
+        for human, assistant in history:
+            messages.append({"role": "user", "content": human })
+            messages.append({"role": "assistant", "content": assistant })
+        messages.append({"role": "user", "content": message['text']})
+    elif len(history) == 0 and image is not None:
         messages.append({"role": "user", "content": f"<image>\n{message['text']}"})
+    elif len(history) == 0 and image is None:
+        messages.append({"role": "user", "content": message['text'] })
+    model = model.to('cuda')
+    # if image is None:
+    #     gr.Error("You need to upload an image for LLaVA to work.")
     image = Image.open(image).convert("RGB")
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True)
+    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
     stop_str = '<|im_end|>'
     keywords = [stop_str]
     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
+    generation_kwargs = dict(input_ids=input_ids.to('cuda'),
+                             images=image_tensor.to('cuda'),
+                             streamer=streamer, max_new_tokens=512,
+                             stopping_criteria=[stopping_criteria], temperature=0.01)
+    generated_text = ""
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+    text_prompt =f"<|im_start|>user\n{message['text']}<|im_end|>"
     buffer = ""
     for new_text in streamer:
+      buffer += new_text
+      generated_text_without_prompt = buffer[:]
+      time.sleep(0.04)
+      yield generated_text_without_prompt
+demo = gr.ChatInterface(fn=bot_streaming, title="🚀nanoLLaVA-1.5", examples=[{"text": "Who is this guy?", "files":["./demo_1.jpg"]},
+                                                                      {"text": "What does the text say?", "files":["./demo_2.jpeg"]}],
+                        description="Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
+                        stop_btn="Stop Generation", multimodal=True)
 demo.queue().launch()