File size: 3,346 Bytes
f269277
8d3e73e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32371f9
8d3e73e
 
 
 
f269277
8d3e73e
 
f269277
8d3e73e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f269277
 
 
 
 
8d3e73e
f269277
8d3e73e
f269277
 
 
 
32371f9
 
8d3e73e
 
 
f269277
 
 
 
8d3e73e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import argparse
import pickle
import warnings
from pathlib import Path

from neus_v.smooth_scoring import smooth_confidence_scores
from neus_v.utils import clear_gpu_memory
from neus_v.veval.eval import evaluate_video_with_sequence_of_images
from neus_v.veval.parse import parse_proposition_set, parse_tl_specification
from neus_v.vlm.internvl import InternVL

# Silence one known-noisy DeprecationWarning; every other warning still surfaces.
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    message="Conversion of an array with ndim > 0 to a scalar is deprecated",
)

# Locations and evaluation settings shared by the functions below.
WEIGHT_PATH = Path("/nas/mars/model_weights/")
pickle_path = WEIGHT_PATH / "distributions.pkl"
num_of_frame_in_sequence = 3
model = "InternVL2-8B"
device = 7  # passed to InternVL as `device`; presumably a GPU index -- TODO confirm

# Instantiate the vision-language model once at import time so that every
# call to process_video() reuses the same instance.
vision_language_model = InternVL(model_name=model, device=device)

# Read the pickled distributions and keep the "all_dimension" entry for the
# selected model; it is used as the prior when smoothing scores.
# NOTE(review): pickle.load can execute arbitrary code, so this file must come
# from a trusted location.
with pickle_path.open("rb") as f:
    distributions = pickle.load(f)
# NOTE(review): if `distributions` is a dict without an entry for `model`, the
# first .get() returns None and the second one raises AttributeError.
all_dimension_data = distributions.get(model).get("all_dimension")


def process_video(video_path, propositions, tl, threshold=0.349):
    """Evaluate a video against a temporal-logic spec and return its smoothed score.

    Args:
        video_path: Path of the video, forwarded unchanged to
            ``evaluate_video_with_sequence_of_images``.
        propositions: Comma-separated proposition names. The string is split
            on "," and the pieces are handed to ``parse_proposition_set``.
        tl: Temporal-logic specification string, parsed with
            ``parse_tl_specification``.
        threshold: Value forwarded as ``threshold`` to the evaluator. Defaults
            to 0.349, the value that used to be hard-coded in this function.

    Returns:
        On success, the result of ``smooth_confidence_scores`` converted to
        ``float``. If evaluation or smoothing raises an ``Exception``, the
        string ``"Error: <message>"`` is returned instead of raising.

    Raises:
        Whatever ``parse_proposition_set`` / ``parse_tl_specification`` raise:
        parsing happens before the ``try`` block, so those errors propagate.
    """
    proposition_set = parse_proposition_set(propositions.split(","))
    tl_spec = parse_tl_specification(tl)

    try:
        result = evaluate_video_with_sequence_of_images(
            vision_language_model=vision_language_model,
            confidence_as_token_probability=True,
            video_path=video_path,
            proposition_set=proposition_set,
            tl_spec=tl_spec,
            parallel_inference=False,
            num_of_frame_in_sequence=num_of_frame_in_sequence,
            threshold=threshold,
        )
        probability = result.get("probability")
        # Smooth the single raw probability against the prior loaded at
        # module level (all_dimension_data).
        return float(
            smooth_confidence_scores(
                target_data=[probability],
                prior_distribution=all_dimension_data,
            )
        )
    except Exception as e:
        # Failures are reported to the caller as a string rather than raised.
        return f"Error: {e!s}"
    finally:
        # Single cleanup point: runs after success, after a handled error, and
        # also when a non-Exception (e.g. KeyboardInterrupt) unwinds the call.
        clear_gpu_memory()


def main():
    """Run ``process_video`` on two hard-coded example videos and print each score.

    Both examples share the same proposition list and temporal-logic
    specification. Command-line arguments are not read; the argparse-based
    entry point is kept below as a commented-out reference.
    """
    # Disabled CLI entry point (kept for reference):
    # parser = argparse.ArgumentParser(description="Process a video using temporal logic evaluation.")
    # parser.add_argument("video", type=str, help="Path to the video file.")
    # parser.add_argument("propositions", type=str, help="List of propositions (comma-separated).")
    # parser.add_argument("tl", type=str, help="Temporal logic specification.")
    # args = parser.parse_args()
    # score = process_video(args.video, args.propositions, args.tl)
    # print(f"Score on All: {score}")

    sample_videos = (
        "/nas/mars/dataset/teaser/A_storm_bursts_in_with_intermittent_lightning_and_causes_flooding_and_large_waves_crash_in.mp4",
        "/nas/mars/dataset/teaser/The ocean waves gently lapping at the shore, until a storm bursts in, and then lightning flashes across the sky.mp4",
    )
    sample_propositions = "waves lapping,ocean shore,storm bursts in,lightning on the sky"
    sample_tl = '("waves_lapping" & "ocean_shore") U ("storm_bursts_in" U "lightning_on_the_sky")'

    # The header is printed before the video is evaluated, so it appears even
    # if the evaluation takes a long time.
    for number, video in enumerate(sample_videos, start=1):
        print(f"Example {number}:")
        print(f"Score: {process_video(video, sample_propositions, sample_tl)}")


# Run the examples only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()