# collarvision/app.py
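"""Live Video AI Assistant.

Streams webcam frames into a Gradio UI and answers free-form questions
about the current frame with the Salesforce/blip-vqa-base model.
"""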
import io

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

import spaces
# Initialize the webcam.
cap = cv2.VideoCapture(0)

# Load the BLIP VQA model and processor. The blip-vqa-base checkpoint is a
# question-answering model, so BlipForQuestionAnswering is the matching class.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
@spaces.GPU
def query_the_image(query: str, image_data: bytes):
    """Answer a free-form question about a JPEG-encoded frame with BLIP VQA."""
    try:
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        inputs = processor(image, query, return_tensors="pt").to(model.device)
        output = model.generate(**inputs)
        return processor.decode(output[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error: {e}"
def get_frame():
    # Grab a single webcam frame and return it as JPEG bytes.
    # Camera capture runs on the CPU, so no @spaces.GPU decorator is needed.
    ret, frame = cap.read()
    if not ret:
        return None
    _, buffer = cv2.imencode(".jpg", frame)
    return buffer.tobytes()
def process_image(prompt):
    # Capture the current frame and ask the model about it; the GPU is
    # requested inside query_the_image.
    frame_data = get_frame()
    if frame_data:
        return query_the_image(prompt, frame_data)
    return "Error capturing image"
def video_feed():
    # Continuously yield RGB frames for the live-preview image component.
    # This is a generator, so it must be consumed by a Gradio event
    # (see gui.load below), not run in a bare thread.
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
gui = gr.Blocks()
with gui:
    gr.Markdown("# Live Video AI Assistant")
    with gr.Row():
        # gr.Image renders the numpy frames yielded by video_feed;
        # gr.Video expects a file path, not a frame stream.
        video_component = gr.Image(label="Live Feed")
    prompt = gr.Textbox(label="Enter your safety policy for the AI to analyse each frame in real time")
    response = gr.Textbox(label="AI Response")
    btn = gr.Button("Ask")
    btn.click(process_image, inputs=prompt, outputs=response)
    # Start streaming frames into the preview once the page loads. The
    # original threading.Thread(target=video_feed) call only created a
    # generator object without consuming it, so no frames were displayed.
    gui.load(video_feed, outputs=video_component)
gui.launch()