Spaces: Running on Zero
Miquel Farre committed · Commit 524fae7 · Parent(s): d068f9e
two highlights path

app.py CHANGED
@@ -83,12 +83,17 @@ class VideoHighlightDetector:
         outputs = self.model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
         return self.processor.decode(outputs[0], skip_special_tokens=True).lower().split("assistant: ")[1]

-    def determine_highlights(self, video_description: str) -> str:
-        """Determine what constitutes highlights based on video description."""
+    def determine_highlights(self, video_description: str, prompt_num: int = 1) -> str:
+        """Determine what constitutes highlights based on video description with different prompts."""
+        system_prompts = {
+            1: "You are a highlight editor. List archetypal dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in any video of this type.",
+            2: "You are a highlight editor focusing on subtle and nuanced moments. List quieter, character-driven, or atmospheric moments that would make interesting highlights. Focus on moments that might be overlooked by traditional highlight detection but add depth to the story."
+        }
+
         messages = [
             {
                 "role": "system",
-                "content": [{"type": "text", "text":
+                "content": [{"type": "text", "text": system_prompts[prompt_num]}]
             },
             {
                 "role": "user",
@@ -96,6 +101,7 @@ class VideoHighlightDetector:
             }
         ]

+        print(f"Using prompt {prompt_num} for highlight detection")
         print(messages)

         inputs = self.processor.apply_chat_template(
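With the `prompt_num` switch in place, a caller can pull two differently biased highlight lists from the same description. A minimal usage sketch; the constructor arguments and the description string are illustrative, only `determine_highlights(..., prompt_num=...)` comes from this diff:

# Illustrative only: how the updated method is meant to be called twice.
# VideoHighlightDetector construction lives elsewhere in app.py and is assumed here.
detector = VideoHighlightDetector(model_path)

video_desc = "A soccer match with several goal attempts and one red card."
dramatic = detector.determine_highlights(video_desc, prompt_num=1)  # archetypal dramatic moments
subtle = detector.determine_highlights(video_desc, prompt_num=2)    # quieter, character-driven moments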
@@ -299,15 +305,17 @@ def create_ui(examples_path: str, model_path: str):
         formatted_desc = f"### Summary:\n {video_desc[:500] + '...' if len(video_desc) > 500 else video_desc}"

         yield [
-            "Determining highlight types...",
+            "Determining highlight types (2 variations)...",
             formatted_desc,
             "",
             gr.update(visible=False),
             gr.update(visible=True)
         ]

-        highlights = detector.determine_highlights(video_desc)
-        formatted_highlights = f"### Highlights to search for:\n{highlights[:500] + '...' if len(highlights) > 500 else highlights}"
+        # Get two different sets of highlights
+        highlights1 = detector.determine_highlights(video_desc, prompt_num=1)
+        highlights2 = detector.determine_highlights(video_desc, prompt_num=2)
+        formatted_highlights = f"### Highlights to search for:\nSet 1:\n{highlights1[:500] + '...' if len(highlights1) > 500 else highlights1}\n\nSet 2:\n{highlights2[:500] + '...' if len(highlights2) > 500 else highlights2}"

         # Split video into segments
         temp_dir = "temp_segments"
@@ -315,7 +323,8 @@ def create_ui(examples_path: str, model_path: str):

         segment_length = 10.0
         duration = get_video_duration_seconds(video)
-        kept_segments = []
+        kept_segments1 = []
+        kept_segments2 = []
         segments_processed = 0
         total_segments = int(duration / segment_length)

@@ -348,10 +357,14 @@ def create_ui(examples_path: str, model_path: str):
             ]
             subprocess.run(cmd, check=True)

-            # Process segment
-            if detector.process_segment(segment_path, highlights):
-                print("KEEPING SEGMENT")
-                kept_segments.append((start_time, end_time))
+            # Process segment with both highlight sets
+            if detector.process_segment(segment_path, highlights1):
+                print("KEEPING SEGMENT FOR SET 1")
+                kept_segments1.append((start_time, end_time))
+
+            if detector.process_segment(segment_path, highlights2):
+                print("KEEPING SEGMENT FOR SET 2")
+                kept_segments2.append((start_time, end_time))

             # Clean up segment file
             os.remove(segment_path)
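The `cmd` that `subprocess.run` executes is assembled above this hunk and is not part of the diff. For orientation, a stream-copy extraction of one 10-second segment typically looks roughly like the sketch below; the exact flags in app.py may differ:

import subprocess

def extract_segment(video: str, start_time: float, segment_length: float, segment_path: str) -> None:
    """Sketch of the kind of ffmpeg command the loop runs per segment.
    Stream copy avoids re-encoding, so cutting is fast."""
    cmd = [
        "ffmpeg", "-y",             # overwrite the output file if it exists
        "-ss", str(start_time),     # seek to the segment start
        "-t", str(segment_length),  # segment duration (10.0 s in app.py)
        "-i", video,
        "-c", "copy",               # copy streams instead of re-encoding
        segment_path,
    ]
    subprocess.run(cmd, check=True)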
@@ -359,14 +372,33 @@ def create_ui(examples_path: str, model_path: str):
         # Remove temp directory
         os.rmdir(temp_dir)

+        # Calculate percentages of video kept for each highlight set
+        total_duration = duration
+        duration1 = sum(end - start for start, end in kept_segments1)
+        duration2 = sum(end - start for start, end in kept_segments2)
+
+        percent1 = (duration1 / total_duration) * 100
+        percent2 = (duration2 / total_duration) * 100
+
+        print(f"Highlight set 1: {percent1:.1f}% of video")
+        print(f"Highlight set 2: {percent2:.1f}% of video")
+
+        # Choose the set with lower percentage unless it's zero
+        final_segments = kept_segments2 if (0 < percent2 <= percent1 or percent1 == 0) else kept_segments1
+
         # Create final video
-        if kept_segments:
+        if final_segments:
             with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
                 temp_output = tmp_file.name
-            detector._concatenate_scenes(video, kept_segments, temp_output)
+            detector._concatenate_scenes(video, final_segments, temp_output)
+
+            selected_set = "2" if final_segments == kept_segments2 else "1"
+            percent_used = percent2 if final_segments == kept_segments2 else percent1
+
+            completion_message = f"Processing complete! Used highlight set {selected_set} ({percent_used:.1f}% of video)"

             yield [
-                "Processing complete!",
+                completion_message,
                 formatted_desc,
                 formatted_highlights,
                 gr.update(value=temp_output, visible=True),
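The `final_segments` line is the crux of the commit: prefer the smaller non-empty cut, on the theory that a tighter reel means the prompt discriminated better. Spelled out as a standalone helper with its edge cases, purely a sketch for clarity, not code from the commit:

def choose_final_segments(kept_segments1, kept_segments2, percent1, percent2):
    """Mirror of the diff's one-liner: prefer the set that keeps less of
    the video, unless that set kept nothing at all.

    - both non-empty -> the smaller percentage wins (ties go to set 2)
    - set 1 empty    -> set 2 wins (even if set 2 is also empty)
    - set 2 empty    -> set 1 wins
    """
    if 0 < percent2 <= percent1 or percent1 == 0:
        return kept_segments2
    return kept_segments1

# percent1=42.0, percent2=18.5 -> set 2 (tighter reel)
# percent1=0.0,  percent2=30.0 -> set 2 (set 1 found nothing)
# percent1=25.0, percent2=0.0  -> set 1 (set 2 found nothing)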
@@ -374,7 +406,7 @@ def create_ui(examples_path: str, model_path: str):
             ]
         else:
             yield [
-                "No highlights detected in the video.",
+                "No highlights detected in the video with either set of criteria.",
                 formatted_desc,
                 formatted_highlights,
                 gr.update(visible=False),
@@ -394,6 +426,7 @@ def create_ui(examples_path: str, model_path: str):
         # Clean up
         torch.cuda.empty_cache()

+
     process_btn.click(
         on_process,
         inputs=[input_video],
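`detector._concatenate_scenes(video, final_segments, temp_output)` is untouched by this commit and its body is not shown. One common way to implement that signature is to cut each kept (start, end) range and join the pieces with ffmpeg's concat demuxer; the sketch below is an illustrative stand-in, not the method from app.py:

import os
import subprocess
import tempfile

def concatenate_scenes(video: str, scene_times: list[tuple[float, float]], output_path: str) -> None:
    """Illustrative stand-in for detector._concatenate_scenes: extract each
    kept (start, end) range, then join the parts with ffmpeg's concat demuxer."""
    with tempfile.TemporaryDirectory() as tmp:
        list_path = os.path.join(tmp, "segments.txt")
        with open(list_path, "w") as listing:
            for i, (start, end) in enumerate(scene_times):
                part = os.path.join(tmp, f"part{i}.mp4")
                subprocess.run([
                    "ffmpeg", "-y", "-ss", str(start), "-t", str(end - start),
                    "-i", video, "-c", "copy", part,
                ], check=True)
                listing.write(f"file '{part}'\n")  # concat demuxer list format
        subprocess.run([
            "ffmpeg", "-y", "-f", "concat", "-safe", "0",  # -safe 0 allows absolute paths
            "-i", list_path, "-c", "copy", output_path,
        ], check=True)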