R2-Tuning

Sleeping

App Files Community

MisbahKhan committed on Apr 14

Commit

30b99b4

verified ·

1 Parent(s): cd4c391

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -119

app.py CHANGED Viewed

@@ -1,21 +1,25 @@
 import random
 from functools import partial
-import gradio as gr
-import torch
 import clip
 import decord
 import nncore
 import numpy as np
-import pandas as pd
 import torchvision.transforms.functional as F
 from decord import VideoReader
 from nncore.engine import load_checkpoint
 from nncore.nn import build_model
-TITLE = '🌀 R2-Tuning: Efficient Image-to-Video Transfer Learning'
 CONFIG = 'configs/qvhighlights/r2_tuning_qvhighlights.py'
 WEIGHT = 'https://huggingface.co/yeliudev/R2-Tuning/resolve/main/checkpoints/r2_tuning_qvhighlights-ed516355.pth'
 EXAMPLES = [
     ('data/gTAvxnQtjXM_60.0_210.0.mp4', 'A man in a white t shirt wearing a backpack is showing a nearby cathedral.'),
     ('data/pA6Z-qYhSNg_210.0_360.0.mp4', 'Different Facebook posts on transgender bathrooms are shown.'),
@@ -23,17 +27,22 @@ EXAMPLES = [
     ('data/ocLUzCNodj4_360.0_510.0.mp4', 'A woman stands in her bedroom in front of a mirror and talks.'),
     ('data/HkLfNhgP0TM_660.0_810.0.mp4', 'Woman lays down on the couch while talking to the camera.')
 ]
 def convert_time(seconds):
     minutes, seconds = divmod(round(max(seconds, 0)), 60)
     return f'{minutes:02d}:{seconds:02d}'
 def load_video(video_path, cfg):
     decord.bridge.set_bridge('torch')
     vr = VideoReader(video_path)
     stride = vr.get_avg_fps() / cfg.data.val.fps
     fm_idx = [min(round(i), len(vr) - 1) for i in np.arange(0, len(vr), stride).tolist()]
     video = vr.get_batch(fm_idx).permute(0, 3, 1, 2).float() / 255
     size = 336 if '336px' in cfg.model.arch else 224
     h, w = video.size(-2), video.size(-1)
     s = min(h, w)
@@ -41,134 +50,81 @@ def load_video(video_path, cfg):
     video = video[..., x:x + s, y:y + s]
     video = F.resize(video, size=(size, size))
     video = F.normalize(video, (0.481, 0.459, 0.408), (0.269, 0.261, 0.276))
-    return video.reshape(video.size(0), -1).unsqueeze(0)
 def init_model(config, checkpoint):
     cfg = nncore.Config.from_file(config)
     cfg.model.init = True
     if checkpoint.startswith('http'):
         checkpoint = nncore.download(checkpoint, out_dir='checkpoints', verbose=False)
     model = build_model(cfg.model, dist=False).eval()
-    return load_checkpoint(model, checkpoint, warning=False), cfg
 def main(video, query, model, cfg):
-    if not video:
-        raise gr.Error("Please upload a video.")
-    if not query:
-        raise gr.Error("Text query cannot be empty.")
     try:
         video = load_video(video, cfg)
-        query = clip.tokenize(query, truncate=True)
-        device = next(model.parameters()).device
-        data = dict(video=video.to(device), query=query.to(device), fps=[cfg.data.val.fps])
-        with torch.inference_mode():
-            pred = model(data)
-        mr = pred['_out']['boundary'][:5].cpu().tolist()
-        mr = [[convert_time(p[0]), convert_time(p[1]), round(p[2], 2)] for p in mr]
-        hd = pred['_out']['saliency'].cpu()
-        hd = ((hd - hd.min()) / (hd.max() - hd.min()) * 0.9 + 0.05).tolist()
-        hd = pd.DataFrame(dict(x=range(0, len(hd) * 2, 2), y=hd))
-        gr.Info("Results generated successfully!")
-        return mr, hd
-    except Exception as e:
-        raise gr.Error(f"Error processing request: {str(e)}")
 model, cfg = init_model(CONFIG, WEIGHT)
 fn = partial(main, model=model, cfg=cfg)
-# Custom CSS
-custom_css = """
-.block { padding: 2rem; }
-.input-card, .output-card {
-    border: 1px solid #E5E7EB;
-    border-radius: 8px;
-    padding: 1rem;
-    background: #FFFFFF;
-    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
-}
-.markdown-guide {
-    background: #F1F5F9;
-    padding: 1rem;
-    border-radius: 8px;
-}
-.video-input {
-    border-radius: 8px;
-    overflow: hidden;
-    border: 1px solid #E5E7EB;
-}
-.button-primary {
-    transition: all 0.2s ease;
-}
-.button-primary:hover {
-    transform: scale(1.05);
-    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
-}
-@media (max-width: 768px) {
-    .block { padding: 1rem; }
-    .input-card, .output-card { padding: 0.5rem; }
-    h1 { font-size: 1.8rem; }
-}
-"""
-# UI
-custom_theme = gr.themes.Base(
-    primary_hue="blue",
-    secondary_hue="gray",
-    neutral_hue="zinc",
-    radius_size="lg",
-    text_size="md",
-    font=["Inter", "Roboto", "sans-serif"],
-)
-TITLE_MD = '<h1 align="center" style="font-size: 2.5rem; font-weight: 700;">🌀 R<sup>2</sup>-Tuning: Image-to-Video Transfer Learning</h1>'
-DESCRIPTION_MD = '''
-<div style="text-align: center; font-size: 1.1rem; color: #4B5EAA;">
-R<sup>2</sup>-Tuning is a parameter-efficient method for video temporal grounding.
-Explore our <a href="https://arxiv.org/abs/2404.00801" style="color: #1D4ED8;">Tech Report</a>
-and <a href="https://github.com/yeliudev/R2-Tuning" style="color: #1D4ED8;">GitHub Repo</a>.
-</div>
-'''
-GUIDE_MD = '''
-### 📋 User Guide
-1. **Upload a video** or click "Random" to try a sample.
-2. **Enter a text query** (5–15 words recommended).
-3. **Click Submit** to view moment retrieval and highlight detection results.
-'''
-with gr.Blocks(title=TITLE, theme=custom_theme, css=custom_css) as demo:
-    gr.Markdown(TITLE_MD, elem_classes="text-center")
-    gr.Markdown(DESCRIPTION_MD, elem_classes="text-center")
-    gr.Markdown(GUIDE_MD, elem_classes="markdown-guide")
-    with gr.Row(variant="panel"):
-        with gr.Column(scale=1, min_width=400):
-            with gr.Group(elem_classes="input-card"):
-                video = gr.Video(label='Upload Video', elem_classes="video-input", height=300)
-                query = gr.Textbox(label='Text Query', placeholder="Enter a descriptive sentence (5-15 words)...")
             with gr.Row():
-                random_btn = gr.Button(value='🔮 Random', variant="secondary")
-                gr.ClearButton([video, query], value='🗑️ Reset', variant="secondary")
-                submit_btn = gr.Button(value='🚀 Submit', variant="primary")
-        with gr.Column(scale=1, min_width=400):
-            with gr.Group(elem_classes="output-card"):
-                mr = gr.DataFrame(
-                    headers=['Start Time', 'End Time', 'Score'],
-                    label='Moment Retrieval',
-                    elem_classes="result-table"
-                )
-                hd = gr.LinePlot(
-                    x='x',
-                    y='y',
-                    x_title='Time (seconds)',
-                    y_title='Saliency Score',
-                    label='Highlight Detection',
-                    color="#4B5EAA",
-                    show_label=True,
-                    height=250,
-                    tooltip=True,
-                    grid=True
-                )
-    random_btn.click(lambda: random.sample(EXAMPLES, 1)[0], None, [video, query])
-    submit_btn.click(fn, [video, query], [mr, hd])
-demo.launch()

 import random
 from functools import partial
 import clip
 import decord
+import gradio as gr
 import nncore
 import numpy as np
+import torch
 import torchvision.transforms.functional as F
 from decord import VideoReader
 from nncore.engine import load_checkpoint
 from nncore.nn import build_model
+import pandas as pd
 CONFIG = 'configs/qvhighlights/r2_tuning_qvhighlights.py'
 WEIGHT = 'https://huggingface.co/yeliudev/R2-Tuning/resolve/main/checkpoints/r2_tuning_qvhighlights-ed516355.pth'
+# yapf:disable
 EXAMPLES = [
     ('data/gTAvxnQtjXM_60.0_210.0.mp4', 'A man in a white t shirt wearing a backpack is showing a nearby cathedral.'),
     ('data/pA6Z-qYhSNg_210.0_360.0.mp4', 'Different Facebook posts on transgender bathrooms are shown.'),
     ('data/ocLUzCNodj4_360.0_510.0.mp4', 'A woman stands in her bedroom in front of a mirror and talks.'),
     ('data/HkLfNhgP0TM_660.0_810.0.mp4', 'Woman lays down on the couch while talking to the camera.')
 ]
+# yapf:enable
 def convert_time(seconds):
     minutes, seconds = divmod(round(max(seconds, 0)), 60)
     return f'{minutes:02d}:{seconds:02d}'
 def load_video(video_path, cfg):
     decord.bridge.set_bridge('torch')
     vr = VideoReader(video_path)
     stride = vr.get_avg_fps() / cfg.data.val.fps
     fm_idx = [min(round(i), len(vr) - 1) for i in np.arange(0, len(vr), stride).tolist()]
     video = vr.get_batch(fm_idx).permute(0, 3, 1, 2).float() / 255
     size = 336 if '336px' in cfg.model.arch else 224
     h, w = video.size(-2), video.size(-1)
     s = min(h, w)
     video = video[..., x:x + s, y:y + s]
     video = F.resize(video, size=(size, size))
     video = F.normalize(video, (0.481, 0.459, 0.408), (0.269, 0.261, 0.276))
+    video = video.reshape(video.size(0), -1).unsqueeze(0)
+    return video
 def init_model(config, checkpoint):
     cfg = nncore.Config.from_file(config)
     cfg.model.init = True
     if checkpoint.startswith('http'):
         checkpoint = nncore.download(checkpoint, out_dir='checkpoints', verbose=False)
     model = build_model(cfg.model, dist=False).eval()
+    model = load_checkpoint(model, checkpoint, warning=False)
+    return model, cfg
 def main(video, query, model, cfg):
+    if len(query) == 0:
+        raise gr.Error('Text query can not be empty.')
     try:
         video = load_video(video, cfg)
+    except Exception:
+        raise gr.Error('Failed to load the video.')
+    query = clip.tokenize(query, truncate=True)
+    device = next(model.parameters()).device
+    data = dict(video=video.to(device), query=query.to(device), fps=[cfg.data.val.fps])
+    with torch.inference_mode():
+        pred = model(data)
+    mr = pred['_out']['boundary'][:5].cpu().tolist()
+    mr = [[convert_time(p[0]), convert_time(p[1]), round(p[2], 2)] for p in mr]
+    hd = pred['_out']['saliency'].cpu()
+    hd = ((hd - hd.min()) / (hd.max() - hd.min()) * 0.9 + 0.05).tolist()
+    hd = pd.DataFrame(dict(x=range(0, len(hd) * 2, 2), y=hd))
+    return mr, hd
 model, cfg = init_model(CONFIG, WEIGHT)
 fn = partial(main, model=model, cfg=cfg)
+with gr.Blocks(title=TITLE) as demo:
+    gr.Markdown(TITLE_MD)
+    gr.Markdown(DESCRIPTION_MD)
+    gr.Markdown(GUIDE_MD)
+    with gr.Row():
+        with gr.Column():
+            video = gr.Video(label='Video')
+            query = gr.Textbox(label='Text Query')
             with gr.Row():
+                random_btn = gr.Button(value='🔮 Random')
+                gr.ClearButton([video, query], value='🗑️ Reset')
+                submit_btn = gr.Button(value='🚀 Submit')
+        with gr.Column():
+            mr = gr.DataFrame(
+                headers=['Start Time', 'End Time', 'Score'], label='Moment Retrieval')
+            hd = gr.LinePlot(
+                x='x',
+                y='y',
+                x_title='Time (seconds)',
+                y_title='Saliency Score',
+                label='Highlight Detection')
+        random_btn.click(lambda: random.sample(EXAMPLES, 1)[0], None, [video, query])
+        submit_btn.click(fn, [video, query], [mr, hd])
+demo.launch()