Spaces:

yuyutsu07
/

Pseudo3D

Running on Zero

App Files Files Community

yuyutsu07 committed on Mar 11

Commit

d494365

verified ·

1 Parent(s): 16f4e59

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -111

app.py CHANGED Viewed

@@ -7,144 +7,182 @@ from torchvision.transforms import ToTensor, Resize
 import spaces
 import tempfile
 from scipy.ndimage import gaussian_filter
 @spaces.GPU
-def generate_parallax_video(image, depth_map, animation_style, amplitude, k, fps, duration, ssaa_factor, use_taa):
-    """
-    Generate a 3D parallax video with enhanced quality features.
-    Args:
-        image (PIL.Image): Input RGB image.
-        depth_map (PIL.Image): Grayscale depth map.
-        animation_style (str): Animation type (e.g., horizontal, spiral).
-        amplitude (float): Camera movement intensity.
-        k (float): Depth displacement scale.
-        fps (int): Frames per second.
-        duration (float): Video duration in seconds.
-        ssaa_factor (int): Super sampling factor (1, 2, 4).
-        use_taa (bool): Enable temporal anti-aliasing.
-    Returns:
-        str: Path to the generated video file.
-    """
-    # Validate input dimensions
     if image.size != depth_map.size:
-        raise ValueError("Image and depth map must have the same dimensions")
-    # Convert to tensors with high precision
     image_tensor = ToTensor()(image).to('cuda', dtype=torch.float32)
     depth_tensor = ToTensor()(depth_map.convert('L')).to('cuda', dtype=torch.float32)
     depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min() + 1e-6)
-    # Smooth depth map to improve intersections
-    depth_np = depth_tensor.squeeze().cpu().numpy()
-    depth_np = gaussian_filter(depth_np, sigma=1)  # Basic smoothing
-    depth_tensor = torch.tensor(depth_np, device='cuda', dtype=torch.float32).unsqueeze(0)
-    # Apply SSAA: upscale image and depth map
     if ssaa_factor > 1:
         upscale = Resize((int(image.height * ssaa_factor), int(image.width * ssaa_factor)), antialias=True)
         image_tensor = upscale(image_tensor)
         depth_tensor = upscale(depth_tensor)
     H, W = image_tensor.shape[1], image_tensor.shape[2]
-    # Create coordinate grid
-    x = torch.arange(0, W).float().to('cuda')
-    y = torch.arange(0, H).float().to('cuda')
-    xx, yy = torch.meshgrid(x, y, indexing='xy')
-    pixel_grid = torch.stack((xx, yy), dim=-1)
-    # Generate frames
     num_frames = int(fps * duration)
     frames = []
     prev_frame = None
-    for frame in range(num_frames):
-        t = frame / num_frames
-        if animation_style == "horizontal":
-            camera_x = amplitude * np.sin(2 * np.pi * t)
-            camera_y = 0
-        elif animation_style == "vertical":
-            camera_x = 0
-            camera_y = amplitude * np.sin(2 * np.pi * t)
-        elif animation_style == "circle":
-            camera_x = amplitude * np.sin(2 * np.pi * t)
-            camera_y = amplitude * np.cos(2 * np.pi * t)
-        elif animation_style == "spiral":  # Inspired by DepthFlow
-            radius = amplitude * (1 - t)
-            camera_x = radius * np.sin(4 * np.pi * t)
-            camera_y = radius * np.cos(4 * np.pi * t)
-        else:
-            raise ValueError(f"Unsupported animation style: {animation_style}")
-        # Compute displacements
         displacement_x = k * camera_x * depth_tensor.squeeze()
         displacement_y = k * camera_y * depth_tensor.squeeze()
-        # Calculate source coordinates
-        source_pixel_x = pixel_grid[:, :, 0] + displacement_x
-        source_pixel_y = pixel_grid[:, :, 1] + displacement_y
-        # Normalize to [-1, 1]
-        grid_x = 2 * source_pixel_x / (W - 1) - 1
-        grid_y = 2 * source_pixel_y / (H - 1) - 1
-        grid = torch.stack((grid_x, grid_y), dim=-1).unsqueeze(0)
-        # Warp with high-quality interpolation
-        warped = torch.nn.functional.grid_sample(image_tensor.unsqueeze(0), grid, mode='bicubic', align_corners=True)
-        # Downsample if SSAA is enabled
-        if ssaa_factor > 1:
-            downscale = Resize((image.height, image.width), antialias=True)
-            warped = downscale(warped.squeeze(0)).unsqueeze(0)
-        # Convert to numpy
-        frame_img = warped.squeeze(0).permute(1, 2, 0).cpu().numpy()
-        frame_img = (frame_img * 255).astype(np.uint8)
-        # Apply TAA if enabled
-        if use_taa and prev_frame is not None:
-            frame_img = (frame_img * 0.8 + prev_frame * 0.2).astype(np.uint8)
         frames.append(frame_img)
-        prev_frame = frame_img
-    # Save video
-    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
-        output_path = tmpfile.name
-        writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
         for frame in frames:
             writer.append_data(frame)
         writer.close()
-    return output_path
-# Gradio interface
-with gr.Blocks(title="Enhanced 3D Parallax Video Generator") as demo:
-    gr.Markdown("# Enhanced 3D Parallax Video Generator")
-    gr.Markdown("Create high-quality 3D parallax videos with advanced features.")
     with gr.Row():
-        image_input = gr.Image(type="pil", label="Upload Image")
-        depth_input = gr.Image(type="pil", label="Upload Depth Map")
     with gr.Row():
-        animation_style = gr.Dropdown(["horizontal", "vertical", "circle", "spiral"], label="Animation Style", value="horizontal")
-        amplitude_slider = gr.Slider(0, 10, value=2, label="Amplitude", step=0.1)
-        k_slider = gr.Slider(1, 20, value=5, label="Depth Scale (k)", step=0.1)
-        fps_slider = gr.Slider(10, 60, value=30, label="FPS", step=1)
-        duration_slider = gr.Slider(1, 10, value=5, label="Duration (s)", step=0.1)
-        ssaa_factor = gr.Dropdown([1, 2, 4], label="SSAA Factor", value=1)
-        use_taa = gr.Checkbox(label="Enable TAA", value=False)
-    generate_btn = gr.Button("Generate Video")
-    video_output = gr.Video(label="Parallax Video")
-    generate_btn.click(
-        fn=generate_parallax_video,
-        inputs=[image_input, depth_input, animation_style, amplitude_slider, k_slider, fps_slider, duration_slider, ssaa_factor, use_taa],
-        outputs=video_output
-    )
 demo.launch()

 import spaces
 import tempfile
 from scipy.ndimage import gaussian_filter
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+# ------------------------- AuraSR Model Definition ------------------------- #
+class ResBlock(torch.nn.Module):
+    """Two 3x3 convolutions wrapped in an identity skip connection.
+
+    Both convolutions map ``n_filters`` channels to ``n_filters`` channels
+    with ``padding=1``, so the output shape equals the input shape.
+    """
+    def __init__(self, n_filters):
+        super().__init__()
+        conv_kwargs = dict(kernel_size=3, padding=1)
+        self.conv1 = torch.nn.Conv2d(n_filters, n_filters, **conv_kwargs)
+        self.conv2 = torch.nn.Conv2d(n_filters, n_filters, **conv_kwargs)
+    def forward(self, x):
+        """Return ``conv2(relu(conv1(x))) + x``."""
+        branch = self.conv2(torch.nn.functional.relu(self.conv1(x)))
+        return branch + x
+class AuraSR(torch.nn.Module):
+    """Residual CNN that enlarges an RGB batch by a factor of ``scale``.
+
+    Layout: a 3x3 head convolution, ``n_blocks`` residual blocks, then a
+    tail that expands channels by ``scale ** 2``, rearranges them into
+    space with ``PixelShuffle(scale)`` and projects back to 3 channels.
+
+    NOTE(review): whether this layer layout matches the parameter names in
+    the ``fal/AuraSR-v2`` checkpoint should be confirmed.
+
+    Args:
+        scale: Spatial upscaling factor applied by the tail.
+        n_filters: Channel width of the head and residual blocks.
+        n_blocks: Number of residual blocks in the body.
+    """
+    def __init__(self, scale=4, n_filters=64, n_blocks=8):
+        super().__init__()
+        self.scale = scale
+        self.head = torch.nn.Conv2d(3, n_filters, 3, padding=1)
+        self.body = torch.nn.Sequential(*[ResBlock(n_filters) for _ in range(n_blocks)])
+        self.tail = torch.nn.Sequential(
+            torch.nn.Conv2d(n_filters, n_filters * (scale ** 2), 3, padding=1),
+            torch.nn.PixelShuffle(scale),
+            torch.nn.Conv2d(n_filters, 3, 3, padding=1)
+        )
+    def forward(self, x):
+        """Map ``(N, 3, H, W)`` to ``(N, 3, H * scale, W * scale)``."""
+        # PixelShuffle in the tail is the only upsampling step. Enlarging the
+        # input by `scale` beforehand as well made the output scale**2 times
+        # the input size (16x for the default scale of 4).
+        x = self.head(x)
+        x = self.body(x)
+        return self.tail(x)
+# Download the AuraSR-v2 weights from the Hugging Face Hub and load them into
+# a module-level model that apply_upscaler reuses for every frame.
+# NOTE(review): load_state_dict is strict by default, so the checkpoint keys
+# must match the AuraSR class defined above exactly - confirm.
+model_path = hf_hub_download(repo_id="fal/AuraSR-v2", filename="model.safetensors")
+state_dict = load_file(model_path)
+upscaler_model = AuraSR()
+upscaler_model.eval()
+upscaler_model.to('cuda')
+upscaler_model.load_state_dict(state_dict)
+# ------------------------- Core Parallax Function ------------------------- #
 @spaces.GPU
+def generate_parallax_video(image, depth_map, animation_style, amplitude, k, fps,
+                           duration, ssaa_factor, use_taa, use_upscaler):
+    """Render a depth-driven parallax animation and return the video path.
+
+    Args:
+        image: PIL image to animate.
+        depth_map: PIL depth map with the same ``size`` as ``image``; it is
+            converted to grayscale and min-max normalized to [0, 1].
+        animation_style: Camera path name passed to ``calculate_movement``.
+        amplitude: Camera movement intensity.
+        k: Scale applied to the depth-weighted displacement.
+        fps: Frames per second of the output video.
+        duration: Video length in seconds.
+        ssaa_factor: Supersampling factor; values above 1 upscale before
+            warping and downscale each frame afterwards.
+        use_taa: Blend each frame with the previous one when true.
+        use_upscaler: Run every finished frame through ``apply_upscaler``.
+
+    Returns:
+        Path of the video file written by ``save_video``.
+
+    Raises:
+        ValueError: If the image and depth map sizes differ, or if
+            ``fps * duration`` yields no frames.
+    """
     if image.size != depth_map.size:
+        raise ValueError("Image and depth map dimensions must match")
+    # Preprocess inputs
     image_tensor = ToTensor()(image).to('cuda', dtype=torch.float32)
     depth_tensor = ToTensor()(depth_map.convert('L')).to('cuda', dtype=torch.float32)
     depth_tensor = (depth_tensor - depth_tensor.min()) / (depth_tensor.max() - depth_tensor.min() + 1e-6)
+    # Apply Gaussian smoothing to depth map
+    depth_np = gaussian_filter(depth_tensor.squeeze().cpu().numpy(), sigma=1)
+    depth_tensor = torch.tensor(depth_np, device='cuda').unsqueeze(0)
+    # Super Sampling Anti-Aliasing
     if ssaa_factor > 1:
         upscale = Resize((int(image.height * ssaa_factor), int(image.width * ssaa_factor)), antialias=True)
         image_tensor = upscale(image_tensor)
         depth_tensor = upscale(depth_tensor)
     H, W = image_tensor.shape[1], image_tensor.shape[2]
+    x, y = torch.meshgrid(torch.arange(W, device='cuda'), torch.arange(H, device='cuda'), indexing='xy')
+    pixel_grid = torch.stack((x, y), dim=-1)
+    # The depth plane does not change between frames; drop only the channel
+    # axis so a height or width of 1 is preserved.
+    depth_plane = depth_tensor.squeeze(0)
+    # Animation parameters
     num_frames = int(fps * duration)
+    if num_frames < 1:
+        raise ValueError("fps * duration must yield at least one frame")
     frames = []
     prev_frame = None
+    for frame_idx in range(num_frames):
+        t = frame_idx / num_frames
+        camera_x, camera_y = calculate_movement(t, amplitude, animation_style)
+        # Calculate displacement
+        displacement_x = k * camera_x * depth_plane
+        displacement_y = k * camera_y * depth_plane
+        # Warp image
+        warped = warp_image(image_tensor, pixel_grid, displacement_x, displacement_y, W, H)
+        # Post-processing
+        frame_img = post_process_frame(warped, ssaa_factor, image.size, use_taa, prev_frame)
+        # Keep the TAA history at the pre-upscale resolution: the next frame is
+        # blended before it is upscaled, so both operands must share one size.
+        prev_frame = frame_img.copy() if use_taa else None
+        # Apply super-resolution
+        if use_upscaler:
+            frame_img = apply_upscaler(frame_img)
         frames.append(frame_img)
+    return save_video(frames, fps)
+# ------------------------- Helper Functions ------------------------- #
+def calculate_movement(t, amplitude, style):
+    """Return the camera offset ``(x, y)`` for animation phase ``t``.
+
+    Args:
+        t: Animation phase; the caller passes ``frame_idx / num_frames``.
+        amplitude: Peak camera offset.
+        style: One of ``"horizontal"``, ``"vertical"``, ``"circle"`` or
+            ``"spiral"``.
+
+    Returns:
+        Tuple ``(camera_x, camera_y)``.
+
+    Raises:
+        ValueError: If ``style`` is not one of the supported names.
+    """
+    if style == "horizontal":
+        return amplitude * np.sin(2*np.pi*t), 0
+    elif style == "vertical":
+        return 0, amplitude * np.sin(2*np.pi*t)
+    elif style == "circle":
+        return amplitude*np.sin(2*np.pi*t), amplitude*np.cos(2*np.pi*t)
+    elif style == "spiral":
+        # Two revolutions whose radius shrinks linearly to zero at t == 1.
+        radius = amplitude * (1 - t)
+        return radius*np.sin(4*np.pi*t), radius*np.cos(4*np.pi*t)
+    # Falling through would return None and fail later when the caller
+    # unpacks the result; report the bad style name instead.
+    raise ValueError(f"Unsupported animation style: {style}")
+def warp_image(image_tensor, pixel_grid, dx, dy, W, H):
+    """Resample ``image_tensor`` at per-pixel offset source positions.
+
+    ``pixel_grid[..., 0]`` and ``pixel_grid[..., 1]`` hold the x and y pixel
+    coordinates; ``dx`` and ``dy`` are added to them to get the sampling
+    positions. The result of ``grid_sample`` keeps a leading batch axis of 1.
+    """
+    sample_x = pixel_grid[..., 0] + dx
+    sample_y = pixel_grid[..., 1] + dy
+    # Rescale pixel coordinates so 0 maps to -1 and W-1 (or H-1) maps to +1,
+    # the convention grid_sample uses with align_corners=True.
+    norm_x = 2 * sample_x / (W - 1) - 1
+    norm_y = 2 * sample_y / (H - 1) - 1
+    sampling_grid = torch.stack((norm_x, norm_y), dim=-1)
+    return torch.nn.functional.grid_sample(
+        image_tensor[None], sampling_grid[None], mode='bicubic', align_corners=True
+    )
+def post_process_frame(warped, ssaa_factor, orig_size, use_taa, prev_frame):
+    """Convert a warped tensor into a displayable ``uint8`` frame.
+
+    Args:
+        warped: Tensor with a leading batch axis of 1, as returned by
+            ``warp_image``.
+        ssaa_factor: When above 1, the frame is resized back down to
+            ``orig_size`` first.
+        orig_size: ``(width, height)`` target size; it is reversed to
+            ``(height, width)`` for ``Resize``.
+        use_taa: Blend with ``prev_frame`` (80% current, 20% previous).
+        prev_frame: Previous ``uint8`` frame of the same shape, or ``None``.
+
+    Returns:
+        ``numpy.ndarray`` of dtype ``uint8`` with channels last.
+    """
+    if ssaa_factor > 1:
+        warped = Resize(orig_size[::-1], antialias=True)(warped.squeeze(0)).unsqueeze(0)
+    # Bicubic sampling can overshoot [0, 1]; without the clamp those values
+    # wrap around in the uint8 cast and show up as speckles. Only the batch
+    # axis is squeezed so a height or width of 1 survives.
+    pixels = warped.squeeze(0).clamp(0, 1)
+    frame = (pixels.permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
+    if use_taa and prev_frame is not None:
+        frame = cv2.addWeighted(frame, 0.8, prev_frame, 0.2, 0)
+    return frame
+def apply_upscaler(frame):
+    """Run one ``uint8`` frame through the module-level ``upscaler_model``.
+
+    The channels-last frame is scaled to [0, 1], moved to channels-first with
+    a batch axis, and sent to the GPU. The model output is clamped to [0, 1]
+    and returned as a channels-last ``uint8`` array.
+    """
+    pixels = torch.tensor(frame).permute(2,0,1).unsqueeze(0).float() / 255.0
+    batch = pixels.to('cuda')
+    # Inference only: no autograd graph is needed.
+    with torch.no_grad():
+        result = upscaler_model(batch)
+    image = result[0].permute(1,2,0).clamp(0,1)
+    return (image.cpu().numpy() * 255).astype(np.uint8)
+def save_video(frames, fps):
+    """Encode ``frames`` into a temporary H.264 MP4 and return its path.
+
+    Args:
+        frames: Iterable of image arrays accepted by the imageio writer.
+        fps: Frame rate passed to the writer.
+
+    Returns:
+        Path of the ``.mp4`` file. It is created with ``delete=False``, so
+        the caller owns its cleanup.
+    """
+    # Only the unique path is needed. Close our own handle before imageio
+    # opens the same file, since a second open can fail on some platforms.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
+        output_path = f.name
+    writer = imageio.get_writer(output_path, fps=fps, codec='libx264', quality=9)
+    try:
+        for frame in frames:
+            writer.append_data(frame)
+    finally:
+        # Finalize the encoder even if a frame is rejected midway.
+        writer.close()
+    return output_path
+# ------------------------- Gradio Interface ------------------------- #
+# Build the Gradio UI: two image inputs, the animation and quality controls,
+# and a button that runs generate_parallax_video and shows the result.
+with gr.Blocks(title="3D Parallax Video Generator with Super-Resolution") as demo:
+    gr.Markdown("# 🔥 3D Parallax Video Generator with 4x Super-Resolution")
+    gr.Markdown("Generate stunning 3D parallax videos from 2D images with optional AI upscaling")
+    # Both uploads are delivered to the callback as PIL images (type="pil").
     with gr.Row():
+        image_input = gr.Image(type="pil", label="Input Image")
+        depth_input = gr.Image(type="pil", label="Depth Map")
+    with gr.Row():
+        # Left column: camera motion controls.
+        with gr.Column():
+            animation_style = gr.Dropdown(["horizontal", "vertical", "circle", "spiral"],
+                                        value="horizontal", label="Animation Style")
+            amplitude = gr.Slider(0, 10, value=2, step=0.1, label="Movement Amplitude")
+            k = gr.Slider(0, 20, value=5, step=0.1, label="Depth Scaling Factor")
+        # Right column: timing and supersampling controls.
+        with gr.Column():
+            fps = gr.Slider(10, 60, value=30, step=1, label="FPS")
+            duration = gr.Slider(1, 10, value=5, step=0.1, label="Duration (seconds)")
+            ssaa_factor = gr.Dropdown([1, 2, 4], value=1, label="Anti-Aliasing Quality")
     with gr.Row():
+        use_taa = gr.Checkbox(label="Enable Temporal Anti-Aliasing", value=False)
+        use_upscaler = gr.Checkbox(label="Enable 4x Super-Resolution (AuraSR-v2)", value=False)
+    generate_btn = gr.Button("Generate Video", variant="primary")
+    video_output = gr.Video(label="Generated Video", format="mp4")
+    # The inputs list follows the positional parameter order of
+    # generate_parallax_video; keep the two in sync when adding controls.
+    generate_btn.click(fn=generate_parallax_video,
+                      inputs=[image_input, depth_input, animation_style, amplitude, k,
+                             fps, duration, ssaa_factor, use_taa, use_upscaler],
+                      outputs=video_output)
 demo.launch()