Spaces:

yuyutsu07
/

Pseudo3D

Running on Zero

App Files Files Community

yuyutsu07 committed on Mar 11

Commit

18e62f1

verified ·

1 Parent(s): 09c414e

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -147

app.py CHANGED Viewed

@@ -7,182 +7,156 @@ from torchvision.transforms import ToTensor, Resize
 import spaces
 import tempfile
 from scipy.ndimage import gaussian_filter
-from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file
-# ------------------------- AuraSR Model Definition ------------------------- #
-class ResBlock(torch.nn.Module):
-    def __init__(self, n_filters):
-        super().__init__()
-        self.conv1 = torch.nn.Conv2d(n_filters, n_filters, 3, padding=1)
-        self.conv2 = torch.nn.Conv2d(n_filters, n_filters, 3, padding=1)
-    def forward(self, x):
-        residual = x
-        x = torch.relu(self.conv1(x))
-        x = self.conv2(x)
-        x += residual
-        return x
-class AuraSR(torch.nn.Module):
-    def __init__(self, scale=4, n_filters=64, n_blocks=8):
-        super().__init__()
-        self.scale = scale
-        self.head = torch.nn.Conv2d(3, n_filters, 3, padding=1)
-        self.body = torch.nn.Sequential(*[ResBlock(n_filters) for _ in range(n_blocks)])
-        self.tail = torch.nn.Sequential(
-            torch.nn.Conv2d(n_filters, n_filters * (scale ** 2), 3, padding=1),
-            torch.nn.PixelShuffle(scale),
-            torch.nn.Conv2d(n_filters, 3, 3, padding=1)
-        )
-    def forward(self, x):
-        x = torch.nn.functional.interpolate(x, scale_factor=self.scale, mode='nearest')
-        x = self.head(x)
-        x = self.body(x)
-        x = self.tail(x)
-        return x
-# Load AuraSR-v2 model
-model_path = hf_hub_download(repo_id="fal/AuraSR-v2", filename="model.safetensors")
-state_dict = load_file(model_path)
-upscaler_model = AuraSR().eval().to('cuda')
-upscaler_model.load_state_dict(state_dict)
-# ------------------------- Core Parallax Function ------------------------- #
 @spaces.GPU
-def generate_parallax_video(image, depth_map, animation_style, amplitude, k, fps,
-                           duration, ssaa_factor, use_taa, use_upscaler):
-    """Generate parallax video with optional super-resolution upscaling"""
     if image.size != depth_map.size:
-        raise ValueError("Image and depth map dimensions must match")
-    # Preprocess inputs
     image_tensor = ToTensor()(image).to('cuda', dtype=torch.float32)
     depth_tensor = ToTensor()(depth_map.convert('L')).to('cuda', dtype=torch.float32)
     depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min() + 1e-6)
-    # Apply Gaussian smoothing to depth map
-    depth_np = gaussian_filter(depth_tensor.squeeze().cpu().numpy(), sigma=1)
-    depth_tensor = torch.tensor(depth_np, device='cuda').unsqueeze(0)
-    # Super Sampling Anti-Aliasing
     if ssaa_factor > 1:
         upscale = Resize((int(image.height * ssaa_factor), int(image.width * ssaa_factor)), antialias=True)
         image_tensor = upscale(image_tensor)
         depth_tensor = upscale(depth_tensor)
     H, W = image_tensor.shape[1], image_tensor.shape[2]
-    x, y = torch.meshgrid(torch.arange(W, device='cuda'), torch.arange(H, device='cuda'), indexing='xy')
-    pixel_grid = torch.stack((x, y), dim=-1)
-    # Animation parameters
     num_frames = int(fps * duration)
     frames = []
     prev_frame = None
-    for frame_idx in range(num_frames):
-        t = frame_idx / num_frames
-        camera_x, camera_y = calculate_movement(t, amplitude, animation_style)
-        # Calculate displacement
         displacement_x = k * camera_x * depth_tensor.squeeze()
         displacement_y = k * camera_y * depth_tensor.squeeze()
-        # Warp image
-        warped = warp_image(image_tensor, pixel_grid, displacement_x, displacement_y, W, H)
-        # Post-processing
-        frame_img = post_process_frame(warped, ssaa_factor, image.size, use_taa, prev_frame)
-        # Apply super-resolution
-        if use_upscaler:
-            frame_img = apply_upscaler(frame_img)
         frames.append(frame_img)
         prev_frame = frame_img.copy() if use_taa else None
-    return save_video(frames, fps)
-# ------------------------- Helper Functions ------------------------- #
-def calculate_movement(t, amplitude, style):
-    """Calculate camera movement based on animation style"""
-    if style == "horizontal":
-        return amplitude * np.sin(2*np.pi*t), 0
-    elif style == "vertical":
-        return 0, amplitude * np.sin(2*np.pi*t)
-    elif style == "circle":
-        return amplitude*np.sin(2*np.pi*t), amplitude*np.cos(2*np.pi*t)
-    elif style == "spiral":
-        radius = amplitude * (1 - t)
-        return radius*np.sin(4*np.pi*t), radius*np.cos(4*np.pi*t)
-def warp_image(image_tensor, pixel_grid, dx, dy, W, H):
-    """Warp image using computed displacements"""
-    source_x = pixel_grid[:, :, 0] + dx
-    source_y = pixel_grid[:, :, 1] + dy
-    grid = torch.stack((2*source_x/(W-1)-1, 2*source_y/(H-1)-1), dim=-1).unsqueeze(0)
-    return torch.nn.functional.grid_sample(image_tensor.unsqueeze(0), grid, mode='bicubic', align_corners=True)
-def post_process_frame(warped, ssaa_factor, orig_size, use_taa, prev_frame):
-    """Process frame with SSAA and TAA"""
-    if ssaa_factor > 1:
-        warped = Resize(orig_size[::-1], antialias=True)(warped.squeeze(0)).unsqueeze(0)
-    frame = (warped.squeeze().permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
-    if use_taa and prev_frame is not None:
-        frame = cv2.addWeighted(frame, 0.8, prev_frame, 0.2, 0)
-    return frame
-def apply_upscaler(frame):
-    """Apply 4x super-resolution using AuraSR-v2"""
-    tensor = torch.tensor(frame).permute(2,0,1).unsqueeze(0).float() / 255.0
-    with torch.no_grad():
-        upscaled = upscaler_model(tensor.to('cuda'))
-    return (upscaled[0].permute(1,2,0).clamp(0,1).cpu().numpy() * 255).astype(np.uint8)
-def save_video(frames, fps):
-    """Save frames to video file"""
-    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
-        writer = imageio.get_writer(f.name, fps=fps, codec='libx264', quality=9)
         for frame in frames:
             writer.append_data(frame)
         writer.close()
-        return f.name
-# ------------------------- Gradio Interface ------------------------- #
-with gr.Blocks(title="3D Parallax Video Generator with Super-Resolution") as demo:
-    gr.Markdown("# 🔥 3D Parallax Video Generator with 4x Super-Resolution")
-    gr.Markdown("Generate stunning 3D parallax videos from 2D images with optional AI upscaling")
-    with gr.Row():
-        image_input = gr.Image(type="pil", label="Input Image")
-        depth_input = gr.Image(type="pil", label="Depth Map")
-    with gr.Row():
-        with gr.Column():
-            animation_style = gr.Dropdown(["horizontal", "vertical", "circle", "spiral"],
-                                        value="horizontal", label="Animation Style")
-            amplitude = gr.Slider(0, 10, value=2, step=0.1, label="Movement Amplitude")
-            k = gr.Slider(0, 20, value=5, step=0.1, label="Depth Scaling Factor")
-        with gr.Column():
-            fps = gr.Slider(10, 60, value=30, step=1, label="FPS")
-            duration = gr.Slider(1, 10, value=5, step=0.1, label="Duration (seconds)")
-            ssaa_factor = gr.Dropdown([1, 2, 4], value=1, label="Anti-Aliasing Quality")
     with gr.Row():
-        use_taa = gr.Checkbox(label="Enable Temporal Anti-Aliasing", value=False)
-        use_upscaler = gr.Checkbox(label="Enable 4x Super-Resolution (AuraSR-v2)", value=False)
-    generate_btn = gr.Button("Generate Video", variant="primary")
-    video_output = gr.Video(label="Generated Video", format="mp4")
-    generate_btn.click(fn=generate_parallax_video,
-                      inputs=[image_input, depth_input, animation_style, amplitude, k,
-                             fps, duration, ssaa_factor, use_taa, use_upscaler],
-                      outputs=video_output)
 demo.launch()

 import spaces
 import tempfile
 from scipy.ndimage import gaussian_filter
+from aura_sr import AuraSR  # Import AuraSR for upscaling
+# Load AuraSR-v2 once at import time; the module-level `aura_sr` instance is reused by every generate_parallax_video call (presumably this fetches the "fal/AuraSR-v2" weights - confirm against the aura_sr package)
+aura_sr = AuraSR.from_pretrained("fal/AuraSR-v2")
 @spaces.GPU
+def generate_parallax_video(image, depth_map, animation_style, amplitude, k, fps, duration, ssaa_factor, use_taa, use_upscale):
+    """
+    Generate a 3D parallax video with enhanced quality features and optional upscaling.
+    Args:
+        image (PIL.Image): Input RGB image.
+        depth_map (PIL.Image): Grayscale depth map.
+        animation_style (str): Animation type.
+        amplitude (float): Camera movement intensity.
+        k (float): Depth displacement scale.
+        fps (int): Frames per second.
+        duration (float): Video duration in seconds.
+        ssaa_factor (int): Super sampling factor (1, 2, 4).
+        use_taa (bool): Enable temporal anti-aliasing.
+        use_upscale (bool): Enable AuraSR-v2 upscaling for each frame.
+    Returns:
+        str: Path to the generated video file.
+    """
+    # Validate input dimensions
     if image.size != depth_map.size:
+        raise ValueError("Image and depth map must have the same dimensions")
+    # Convert to tensors with high precision
     image_tensor = ToTensor()(image).to('cuda', dtype=torch.float32)
     depth_tensor = ToTensor()(depth_map.convert('L')).to('cuda', dtype=torch.float32)
     depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min() + 1e-6)
+    # Smooth depth map
+    depth_np = depth_tensor.squeeze().cpu().numpy()
+    depth_np = gaussian_filter(depth_np, sigma=1)
+    depth_tensor = torch.tensor(depth_np, device='cuda', dtype=torch.float32).unsqueeze(0)
+    # Apply SSAA
     if ssaa_factor > 1:
         upscale = Resize((int(image.height * ssaa_factor), int(image.width * ssaa_factor)), antialias=True)
         image_tensor = upscale(image_tensor)
         depth_tensor = upscale(depth_tensor)
     H, W = image_tensor.shape[1], image_tensor.shape[2]
+    # Create coordinate grid
+    x = torch.arange(0, W).float().to('cuda')
+    y = torch.arange(0, H).float().to('cuda')
+    xx, yy = torch.meshgrid(x, y, indexing='xy')
+    pixel_grid = torch.stack((xx, yy), dim=-1)
+    # Generate frames
     num_frames = int(fps * duration)
     frames = []
     prev_frame = None
+    for frame in range(num_frames):
+        t = frame / num_frames
+        if animation_style == "horizontal":
+            camera_x = amplitude * np.sin(2 * np.pi * t)
+            camera_y = 0
+        elif animation_style == "vertical":
+            camera_x = 0
+            camera_y = amplitude * np.sin(2 * np.pi * t)
+        elif animation_style == "circle":
+            camera_x = amplitude * np.sin(2 * np.pi * t)
+            camera_y = amplitude * np.cos(2 * np.pi * t)
+        elif animation_style == "spiral":
+            radius = amplitude * (1 - t)
+            camera_x = radius * np.sin(4 * np.pi * t)
+            camera_y = radius * np.cos(4 * np.pi * t)
+        else:
+            raise ValueError(f"Unsupported animation style: {animation_style}")
+        # Compute displacements
         displacement_x = k * camera_x * depth_tensor.squeeze()
         displacement_y = k * camera_y * depth_tensor.squeeze()
+        # Calculate source coordinates
+        source_pixel_x = pixel_grid[:, :, 0] + displacement_x
+        source_pixel_y = pixel_grid[:, :, 1] + displacement_y
+        # Normalize to [-1, 1]
+        grid_x = 2 * source_pixel_x / (W - 1) - 1
+        grid_y = 2 * source_pixel_y / (H - 1) - 1
+        grid = torch.stack((grid_x, grid_y), dim=-1).unsqueeze(0)
+        # Warp with bicubic interpolation
+        warped = torch.nn.functional.grid_sample(image_tensor.unsqueeze(0), grid, mode='bicubic', align_corners=True)
+        # Downsample if SSAA is enabled
+        if ssaa_factor > 1:
+            downscale = Resize((image.height, image.width), antialias=True)
+            warped = downscale(warped.squeeze(0)).unsqueeze(0)
+        # Convert to PIL image for upscaling or further processing
+        frame_img = warped.squeeze(0).permute(1, 2, 0).cpu().numpy()
+        frame_img = (frame_img * 255).astype(np.uint8)
+        frame_pil = Image.fromarray(frame_img)
+        # Apply AuraSR-v2 upscaling if enabled
+        if use_upscale:
+            frame_pil = aura_sr.upscale_4x_overlapped(frame_pil)  # 4x upscaling
+            frame_img = np.array(frame_pil)
+        # Apply TAA if enabled
+        if use_taa and prev_frame is not None:
+            frame_img = (frame_img * 0.8 + prev_frame * 0.2).astype(np.uint8)
         frames.append(frame_img)
         prev_frame = frame_img.copy() if use_taa else None
+    # Save video
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+        output_path = tmpfile.name
+        writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
         for frame in frames:
             writer.append_data(frame)
         writer.close()
+    return output_path
+# Gradio interface: image/depth uploads, a row of animation and quality controls, then the generate button and video output
+with gr.Blocks(title="Enhanced 3D Parallax Video Generator with Upscaling") as demo:
+    gr.Markdown("# Enhanced 3D Parallax Video Generator with Upscaling")
+    gr.Markdown("Create high-quality 3D parallax videos with advanced features and optional AuraSR-v2 upscaling.")
     with gr.Row():  # source image and its depth map, both delivered as PIL images
+        image_input = gr.Image(type="pil", label="Upload Image")
+        depth_input = gr.Image(type="pil", label="Upload Depth Map")
+    with gr.Row():  # animation and quality controls
+        animation_style = gr.Dropdown(["horizontal", "vertical", "circle", "spiral"], label="Animation Style", value="horizontal")
+        amplitude_slider = gr.Slider(0, 10, value=2, label="Amplitude", step=0.1)
+        k_slider = gr.Slider(1, 20, value=5, label="Depth Scale (k)", step=0.1)
+        fps_slider = gr.Slider(10, 60, value=30, label="FPS", step=1)
+        duration_slider = gr.Slider(1, 10, value=5, label="Duration (s)", step=0.1)
+        ssaa_factor = gr.Dropdown([1, 2, 4], label="SSAA Factor", value=1)
+        use_taa = gr.Checkbox(label="Enable TAA", value=False)
+        use_upscale = gr.Checkbox(label="Enable AuraSR-v2 Upscaling", value=False)
+    generate_btn = gr.Button("Generate Video")
+    video_output = gr.Video(label="Parallax Video")
+    generate_btn.click(
+        fn=generate_parallax_video,
+        inputs=[image_input, depth_input, animation_style, amplitude_slider, k_slider, fps_slider, duration_slider, ssaa_factor, use_taa, use_upscale],  # listed in the same order as generate_parallax_video's parameters
+        outputs=video_output
+    )
 demo.launch()