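"""Gradio demo for NeuS-V: Neuro-Symbolic Evaluation of Text-to-Video Models.

The app connects to a vision-language model served behind an OpenAI-compatible
endpoint, evaluates an uploaded video against a set of propositions and a
temporal logic (TL) specification, and reports a calibrated NeuS-V score.
Propositions and the TL spec can be entered manually or auto-generated from
the text-to-video prompt via PULS.
"""
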
import pickle
import warnings
from pathlib import Path

import gradio as gr

from neus_v.puls.prompts import Mode
from neus_v.puls.puls import PULS
from neus_v.smooth_scoring import smooth_confidence_scores
from neus_v.utils import clear_gpu_memory
from neus_v.veval.eval import evaluate_video_with_sequence_of_images
from neus_v.veval.parse import parse_proposition_set, parse_tl_specification
from neus_v.vlm.vllm_client import VLLMClient

# Suppress specific warnings
warnings.filterwarnings(
    "ignore", category=DeprecationWarning, message="Conversion of an array with ndim > 0 to a scalar is deprecated"
)

# Paths and parameters
WEIGHT_PATH = Path("./assets/")
pickle_path = WEIGHT_PATH / "distributions.pkl"
num_of_frame_in_sequence = 3
model = "InternVL2-8B"  # key into the precomputed score distributions loaded below
# Connect to the vision-language model served via an OpenAI-compatible (vLLM) endpoint
vision_language_model = VLLMClient(api_base="http://localhost:8000/v1", model="OpenGVLab/InternVL2_5-8B")
# Load distributions
print(f"Loading distributions from {pickle_path}")
with open(pickle_path, "rb") as f:
    distributions = pickle.load(f)
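# "all_dimension" is the pooled score distribution for this model; it serves as the
# prior when smoothing a raw probability into the final NeuS-V score (see below).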
all_dimension_data = distributions.get(model).get("all_dimension")


# TODO: Make paths better for public release
def process_video(video_path, propositions, tl):
    """Process the video and compute the score_on_all."""
    proposition_set = parse_proposition_set(propositions.split(","))
    tl_spec = parse_tl_specification(tl)
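    # Confidence threshold forwarded to the evaluator; 0.349 is the value this demo
    # ships with (its exact use is defined in evaluate_video_with_sequence_of_images).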
    threshold = 0.349

    try:
        result = evaluate_video_with_sequence_of_images(
            vision_language_model=vision_language_model,
            confidence_as_token_probability=True,
            video_path=video_path,
            proposition_set=proposition_set,
            tl_spec=tl_spec,
            parallel_inference=False,
            num_of_frame_in_sequence=num_of_frame_in_sequence,
            threshold=threshold,
        )
        probability = result.get("probability")
        score_on_all = float(
            smooth_confidence_scores(
                target_data=[probability],
                prior_distribution=all_dimension_data,
            )
        )
        clear_gpu_memory()
        return score_on_all

    except Exception as e:
        clear_gpu_memory()
        return f"Error: {str(e)}"


def generate_from_puls(prompt, mode_choice):
    """Generate propositions and TL spec from a natural language prompt using PULS."""
    # Require a non-blank prompt before calling PULS.
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a text prompt first", duration=5)

    mode_map = {
        "Object-Action Alignment": Mode.OBJECT_ACTION_ALIGNMENT,
        "Overall Consistency (default)": Mode.OVERALL_CONSISTENCY,
        "Object Existence": Mode.OBJECT_EXISTENCE,
        "Spatial Relationship": Mode.SPATIAL_RELATIONSHIP,
    }

    selected_mode = mode_map[mode_choice]
    result = PULS(prompt, [selected_mode])

    # Extract the relevant propositions and spec based on the selected mode.
    # The result keys follow the lower-cased enum names, except that
    # SPATIAL_RELATIONSHIP is pluralized.
    mode_key = selected_mode.name.lower()
    if mode_key == "spatial_relationship":
        mode_key = "spatial_relationships"

    propositions = result.get(mode_key, [])
    spec = result.get(f"{mode_key}_spec", "")

    return ", ".join(propositions), spec


# Gradio interface
def demo_interface(video, propositions, tl):
    """Wrapper for the Gradio interface."""
    return process_video(video, propositions, tl)


# Example data from the original script
example_video_path_1 = (
    "assets/A_storm_bursts_in_with_intermittent_lightning_and_causes_flooding_and_large_waves_crash_in.mp4"
)
example_video_path_2 = "assets/The ocean waves gently lapping at the shore, until a storm bursts in, and then lightning flashes across the sky.mp4"
example_propositions = "waves lapping,ocean shore,storm bursts in,lightning on the sky"
example_tl = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'
example_prompt = (
    "The ocean waves gently lapping at the shore, until a storm bursts in, and then lightning flashes across the sky"
)

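# Build the Gradio UI: inputs (video, prompt, optional PULS controls, propositions,
# TL spec) in the left column and the resulting NeuS-V score in the right column.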
with gr.Blocks(title="NeuS-V: Neuro-Symbolic Evaluation of Text-to-Video Models") as demo:
    gr.HTML(
        """
    <div style="text-align: center; margin-bottom: 20px;">
        <h1>NeuS-V: Neuro-Symbolic Evaluation of Text-to-Video Models using Formal Verification πŸ€—</h1>
    </div>
    <div style="text-align: center; margin-bottom: 20px;">
        <p>
            <a href="https://arxiv.org/abs/2403.12345">πŸ“œ Paper</a> | 
            <a href="https://github.com/UTAustin-SwarmLab/NeuS-V">πŸ’» GitHub</a> | 
            <a href="https://ut-austin-swarmlab.github.io/NeuS-V/">🌐 Project Page</a>
        </p>
    </div>
    <div style="text-align: center; font-size: 15px; font-weight: bold; color: red; margin-bottom: 20px;">
        ⚠️ This demo is for academic research and experimental use only.
    </div>

    """
    )

    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            prompt_input = gr.Textbox(
                label="Text-to-Video Prompt",
                placeholder="The prompt used to generate the video...",
            )
            gr.Markdown(
                "You can either enter the propositions and temporal logic specification manually below, or tick the checkbox to auto-generate them from your text prompt using Prompt Understanding via Temporal Logic Specification (PULS). Please refer to our paper for more details."
            )

            with gr.Row():
                use_puls_checkbox = gr.Checkbox(label="Use PULS to auto-generate propositions and TL spec", value=False)
                mode_choice = gr.Dropdown(
                    choices=[
                        "Overall Consistency (default)",
                        "Object Existence",
                        "Spatial Relationship",
                        "Object-Action Alignment",
                    ],
                    label="PULS Mode",
                    visible=False,
                )
                generate_btn = gr.Button("Generate Propositions & TL Spec", visible=False)

            propositions_input = gr.Textbox(label="List of Propositions (comma-separated)", placeholder="A, B, C")
            tl_input = gr.Textbox(
                label="Temporal Logic Specification", placeholder="(A & B) U C - means A and B hold until C occurs"
            )

            process_btn = gr.Button("Process Video", variant="primary")

        with gr.Column():
            output_score = gr.Textbox(label="NeuS-V Score")
            gr.Markdown(
                """
                #### About the Score
                The NeuS-V score (0-1) measures how well your video matches the specified temporal logic conditions. A higher score indicates better alignment with the expected sequence of events.
                """
            )

    # Show/hide PULS controls based on checkbox
    use_puls_checkbox.change(
        fn=lambda x: (gr.update(visible=x), gr.update(visible=x)),
        inputs=[use_puls_checkbox],
        outputs=[mode_choice, generate_btn],
    )

    # Generate propositions and TL spec from natural language
    generate_btn.click(
        fn=generate_from_puls, inputs=[prompt_input, mode_choice], outputs=[propositions_input, tl_input]
    )

    # Process video with current propositions and TL spec
    process_btn.click(fn=demo_interface, inputs=[video_input, propositions_input, tl_input], outputs=[output_score])

    # Examples
    gr.Examples(
        examples=[
            [example_video_path_1, example_prompt, "Overall Consistency (default)", example_propositions, example_tl],
            [example_video_path_2, example_prompt, "Overall Consistency (default)", example_propositions, example_tl],
        ],
        inputs=[video_input, prompt_input, mode_choice, propositions_input, tl_input],
    )


if __name__ == "__main__":
    demo.launch(allowed_paths=["assets/"], server_name="0.0.0.0", server_port=7860)