YiftachEde committed on
Commit 0f41ba2 · 1 Parent(s): 805a8bb

Adding app.py

Files changed (2)
  1. app.py +419 -0
  2. app2.py +419 -0
app.py ADDED
@@ -0,0 +1,419 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from omegaconf import OmegaConf
+ from pytorch_lightning import seed_everything
+ from huggingface_hub import hf_hub_download
+ from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
+ from einops import rearrange
+ from shap_e.diffusion.sample import sample_latents
+ from shap_e.diffusion.gaussian_diffusion import diffusion_from_config
+ from shap_e.models.download import load_model, load_config
+ from shap_e.util.notebooks import create_pan_cameras, decode_latent_images, create_custom_cameras
+
+ from src.utils.train_util import instantiate_from_config
+ from src.utils.camera_util import (
+     FOV_to_intrinsics,
+     get_zero123plus_input_cameras,
+     get_circular_camera_poses,
+     spherical_camera_pose
+ )
+ from src.utils.mesh_util import save_obj, save_glb
+ from src.utils.infer_util import remove_background, resize_foreground
+
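+ # Builds both stages: the zero123plus diffusion pipeline (with a widened
+ # 8-channel conv_in and a fine-tuned UNet checkpoint) and the InstantMesh
+ # sparse-view reconstruction model.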
+ def load_models():
+     """Initialize and load all required models"""
+     config = OmegaConf.load('configs/instant-nerf-large-best.yaml')
+     model_config = config.model_config
+     infer_config = config.infer_config
+
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # Load diffusion pipeline
+     print('Loading diffusion pipeline...')
+     pipeline = DiffusionPipeline.from_pretrained(
+         "sudo-ai/zero123plus-v1.2",
+         custom_pipeline="zero123plus",
+         torch_dtype=torch.float16
+     )
+     pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
+         pipeline.scheduler.config, timestep_spacing='trailing'
+     )
+
+     # Modify UNet to handle 8 input channels instead of 4
+     in_channels = 8
+     out_channels = pipeline.unet.conv_in.out_channels
+     pipeline.unet.register_to_config(in_channels=in_channels)
+     with torch.no_grad():
+         new_conv_in = nn.Conv2d(
+             in_channels, out_channels, pipeline.unet.conv_in.kernel_size,
+             pipeline.unet.conv_in.stride, pipeline.unet.conv_in.padding
+         )
+         new_conv_in.weight.zero_()
+         new_conv_in.weight[:, :4, :, :].copy_(pipeline.unet.conv_in.weight)
+         pipeline.unet.conv_in = new_conv_in
+
+     # Load custom UNet
+     print('Loading custom UNet...')
+     unet_path = "best_21.ckpt"
+     state_dict = torch.load(unet_path, map_location='cpu')
+
+     # Process the state dict to match the model keys
+     if 'state_dict' in state_dict:
+         new_state_dict = {key.replace('unet.unet.', ''): value for key, value in state_dict['state_dict'].items()}
+         pipeline.unet.load_state_dict(new_state_dict, strict=False)
+     else:
+         pipeline.unet.load_state_dict(state_dict, strict=False)
+
+     pipeline = pipeline.to(device).to(torch_dtype=torch.float16)
+
+     # Load reconstruction model
+     print('Loading reconstruction model...')
+     model = instantiate_from_config(model_config)
+     model_path = hf_hub_download(
+         repo_id="TencentARC/InstantMesh",
+         filename="instant_nerf_large.ckpt",
+         repo_type="model"
+     )
+     state_dict = torch.load(model_path, map_location='cpu')['state_dict']
+     state_dict = {k[14:]: v for k, v in state_dict.items()
+                   if k.startswith('lrm_generator.') and 'source_camera' not in k}
+     model.load_state_dict(state_dict, strict=True)
+     model = model.to(device)
+     model.eval()
+
+     return pipeline, model, infer_config
+
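+ # Standalone refinement helper for uploaded files: accepts a ready-made
+ # 640x960 layout, a single view (tiled six times), or up to six views, and
+ # assembles a 3x2 grid of 320x320 tiles before calling pipeline.refine.
+ # The Gradio demo below goes through RefinerInterface rather than this helper.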
+ def process_images(input_images, prompt, steps=75, guidance_scale=7.5, pipeline=None):
+     """Process input images and run refinement"""
+     device = pipeline.device
+
+     if isinstance(input_images, list):
+         if len(input_images) == 1:
+             # Check if this is a pre-arranged layout
+             img = Image.open(input_images[0].name).convert('RGB')
+             if img.size == (640, 960):
+                 # This is already a layout, use it directly
+                 input_image = img
+             else:
+                 # Single view - need 6 copies
+                 img = img.resize((320, 320))
+                 img_array = np.array(img) / 255.0
+                 images = [img_array] * 6
+                 images = np.stack(images)
+
+                 # Convert to tensor and create layout
+                 images = torch.from_numpy(images).float()
+                 images = images.permute(0, 3, 1, 2)
+                 images = images.reshape(3, 2, 3, 320, 320)
+                 images = images.permute(0, 2, 3, 1, 4)
+                 images = images.reshape(3, 3, 320, 640)
+                 images = images.reshape(1, 3, 960, 640)
+
+                 # Convert back to PIL
+                 images = images.permute(0, 2, 3, 1)[0]
+                 images = (images.numpy() * 255).astype(np.uint8)
+                 input_image = Image.fromarray(images)
+         else:
+             # Multiple individual views
+             images = []
+             for img_file in input_images:
+                 img = Image.open(img_file.name).convert('RGB')
+                 img = img.resize((320, 320))
+                 img = np.array(img) / 255.0
+                 images.append(img)
+
+             # Pad to 6 images if needed
+             while len(images) < 6:
+                 images.append(np.zeros_like(images[0]))
+             images = np.stack(images[:6])
+
+             # Convert to tensor and create layout
+             images = torch.from_numpy(images).float()
+             images = images.permute(0, 3, 1, 2)
+             images = images.reshape(3, 2, 3, 320, 320)
+             images = images.permute(0, 2, 3, 1, 4)
+             images = images.reshape(3, 3, 320, 640)
+             images = images.reshape(1, 3, 960, 640)
+
+             # Convert back to PIL
+             images = images.permute(0, 2, 3, 1)[0]
+             images = (images.numpy() * 255).astype(np.uint8)
+             input_image = Image.fromarray(images)
+     else:
+         raise ValueError("Expected a list of images")
+
+     # Generate refined output
+     output = pipeline.refine(
+         input_image,
+         prompt=prompt,
+         num_inference_steps=int(steps),
+         guidance_scale=guidance_scale
+     ).images[0]
+
+     return output, input_image
+
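+ # Splits the refined 3x2 grid back into six 320x320 views, then runs the
+ # reconstruction model: forward_planes builds the triplane features and
+ # extract_mesh pulls out vertices, faces, and per-vertex colors.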
+ def create_mesh(refined_image, model, infer_config):
+     """Generate mesh from refined image"""
+     # Convert PIL image to tensor
+     image = np.array(refined_image) / 255.0
+     image = torch.from_numpy(image).float().permute(2, 0, 1)
+
+     # Reshape to 6 views
+     image = image.reshape(3, 960, 640)
+     image = image.reshape(3, 3, 320, 640)
+     image = image.permute(1, 0, 2, 3)
+     image = image.reshape(3, 3, 320, 2, 320)
+     image = image.permute(0, 3, 1, 2, 4)
+     image = image.reshape(6, 3, 320, 320)
+
+     # Add batch dimension
+     image = image.unsqueeze(0)
+
+     input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to("cuda")
+     image = image.to("cuda")
+
+     with torch.no_grad():
+         planes = model.forward_planes(image, input_cameras)
+         mesh_out = model.extract_mesh(planes, **infer_config)
+         vertices, faces, vertex_colors = mesh_out
+
+     return vertices, faces, vertex_colors
+
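+ # Wraps the Shap-E text-to-3D stage: samples a latent from the prompt, then
+ # renders it from the six fixed camera poses the refinement stage expects.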
+ class ShapERenderer:
+     def __init__(self, device):
+         print("Loading Shap-E models...")
+         self.device = device
+         self.xm = load_model('transmitter', device=device)
+         self.model = load_model('text300M', device=device)
+         self.diffusion = diffusion_from_config(load_config('diffusion'))
+         print("Shap-E models loaded!")
+
+     def generate_views(self, prompt, guidance_scale=15.0, num_steps=64):
+         # Generate latents using the text-to-3D model
+         batch_size = 1
+         guidance_scale = float(guidance_scale)
+         latents = sample_latents(
+             batch_size=batch_size,
+             model=self.model,
+             diffusion=self.diffusion,
+             guidance_scale=guidance_scale,
+             model_kwargs=dict(texts=[prompt] * batch_size),
+             progress=True,
+             clip_denoised=True,
+             use_fp16=True,
+             use_karras=True,
+             karras_steps=num_steps,
+             sigma_min=1e-3,
+             sigma_max=160,
+             s_churn=0,
+         )
+
+         # Render the 6 views we need with specific viewing angles
+         size = 320  # Size of each rendered image
+         images = []
+
+         # Define our 6 specific camera positions to match refine.py
+         azimuths = [30, 90, 150, 210, 270, 330]
+         elevations = [20, -10, 20, -10, 20, -10]
+
+         for i, (azimuth, elevation) in enumerate(zip(azimuths, elevations)):
+             cameras = create_custom_cameras(size, self.device, azimuths=[azimuth], elevations=[elevation], fov_degrees=30, distance=3.0)
+             rendered_image = decode_latent_images(
+                 self.xm,
+                 latents[0],
+                 rendering_mode='stf',
+                 cameras=cameras
+             )
+             images.append(rendered_image.detach().cpu().numpy())
+
+         # Convert images to uint8 (no 0-1 -> 0-255 scaling is applied here)
+         images = [image.astype(np.uint8) for image in images]
+
+         # Assemble a 3x2 grid of 320px tiles (3 rows, 2 columns -> 960x640 array)
+         layout = np.zeros((960, 640, 3), dtype=np.uint8)
+         for i, img in enumerate(images):
+             row = i // 2  # 2 images per row, 3 rows
+             col = i % 2
+             layout[row*320:(row+1)*320, col*320:(col+1)*320] = img
+
+         return Image.fromarray(layout), images
+
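+ # Ties the two stages together for the UI: normalizes the incoming 6-view
+ # grid, runs the zero123plus refinement pass, and reconstructs a mesh saved
+ # to a temporary .obj file.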
+ class RefinerInterface:
+     def __init__(self):
+         print("Initializing InstantMesh models...")
+         self.pipeline, self.model, self.infer_config = load_models()
+         print("InstantMesh models loaded!")
+
+     def refine_model(self, input_image, prompt, steps=75, guidance_scale=7.5):
+         """Main refinement function"""
+         # Process image and get refined output
+         input_image = Image.fromarray(input_image)
+
+         # If the grid arrives as 2 rows x 3 columns (960 wide, 640 tall),
+         # rearrange it into the 3 rows x 2 columns layout the pipeline expects
+         if input_image.width == 960 and input_image.height == 640:
+             input_array = np.array(input_image)
+             new_layout = np.zeros((960, 640, 3), dtype=np.uint8)
+
+             # Move each 320px tile from its 2x3 position to its 3x2 position
+             for i in range(6):
+                 src_row = i // 3
+                 src_col = i % 3
+                 dst_row = i // 2
+                 dst_col = i % 2
+
+                 new_layout[dst_row*320:(dst_row+1)*320, dst_col*320:(dst_col+1)*320] = \
+                     input_array[src_row*320:(src_row+1)*320, src_col*320:(src_col+1)*320]
+
+             input_image = Image.fromarray(new_layout)
+
+         # Process with the pipeline (expects the 3 rows x 2 columns grid)
+         refined_output_960x640 = self.pipeline.refine(
+             input_image,
+             prompt=prompt,
+             num_inference_steps=int(steps),
+             guidance_scale=guidance_scale
+         ).images[0]
+
+         # Generate mesh using the 960x640 format
+         vertices, faces, vertex_colors = create_mesh(
+             refined_output_960x640,
+             self.model,
+             self.infer_config
+         )
+
+         # Save temporary mesh file
+         os.makedirs("temp", exist_ok=True)
+         temp_obj = os.path.join("temp", "refined_mesh.obj")
+         save_obj(vertices, faces, vertex_colors, temp_obj)
+
+         # Prepare the display copy of the refined grid; source and destination
+         # tile indices below are identical, so this loop is a straight copy
+         refined_array = np.array(refined_output_960x640)
+         display_layout = np.zeros((960, 640, 3), dtype=np.uint8)
+
+         for i in range(6):
+             src_row = i // 2
+             src_col = i % 2
+             dst_row = i // 2
+             dst_col = i % 2
+
+             display_layout[dst_row*320:(dst_row+1)*320, dst_col*320:(dst_col+1)*320] = \
+                 refined_array[src_row*320:(src_row+1)*320, src_col*320:(src_col+1)*320]
+
+         refined_output_640x960 = Image.fromarray(display_layout)
+
+         return refined_output_640x960, temp_obj
+
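+ # Gradio UI: prompt -> Shap-E views -> zero123plus refinement -> 3D mesh preview.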
+ def create_demo():
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     shap_e = ShapERenderer(device)
+     refiner = RefinerInterface()
+
+     with gr.Blocks() as demo:
+         gr.Markdown("# Shap-E to InstantMesh Pipeline")
+
+         # First row: Controls
+         with gr.Row():
+             with gr.Column():
+                 # Shap-E inputs
+                 shape_prompt = gr.Textbox(
+                     label="Shap-E Prompt",
+                     placeholder="Enter text to generate initial 3D model..."
+                 )
+                 shape_guidance = gr.Slider(
+                     minimum=1,
+                     maximum=30,
+                     value=15.0,
+                     label="Shap-E Guidance Scale"
+                 )
+                 shape_steps = gr.Slider(
+                     minimum=16,
+                     maximum=128,
+                     value=64,
+                     step=16,
+                     label="Shap-E Steps"
+                 )
+                 generate_btn = gr.Button("Generate Views")
+
+             with gr.Column():
+                 # Refinement inputs
+                 refine_prompt = gr.Textbox(
+                     label="Refinement Prompt",
+                     placeholder="Enter prompt to guide refinement..."
+                 )
+                 refine_steps = gr.Slider(
+                     minimum=30,
+                     maximum=100,
+                     value=75,
+                     step=1,
+                     label="Refinement Steps"
+                 )
+                 refine_guidance = gr.Slider(
+                     minimum=1,
+                     maximum=20,
+                     value=7.5,
+                     label="Refinement Guidance Scale"
+                 )
+                 refine_btn = gr.Button("Refine")
+
+         # Second row: Image panels side by side
+         with gr.Row():
+             # Outputs - Images side by side
+             shape_output = gr.Image(
+                 label="Generated Views",
+                 width=640,   # layout images are 640 wide
+                 height=960   # by 960 tall
+             )
+             refined_output = gr.Image(
+                 label="Refined Output",
+                 width=640,
+                 height=960
+             )
+
+         # Third row: 3D mesh panel below
+         with gr.Row():
+             # 3D mesh centered
+             mesh_output = gr.Model3D(
+                 label="3D Mesh",
+                 clear_color=[1.0, 1.0, 1.0, 1.0],
+                 width=1280,  # Full width
+                 height=600   # Taller for better visualization
+             )
+
+         # Set up event handlers
+         def generate(prompt, guidance_scale, num_steps):
+             with torch.no_grad():
+                 layout, _ = shap_e.generate_views(prompt, guidance_scale, num_steps)
+             return layout
+
+         def refine(input_image, prompt, steps, guidance_scale):
+             refined_img, mesh_path = refiner.refine_model(
+                 input_image,
+                 prompt,
+                 steps,
+                 guidance_scale
+             )
+             return refined_img, mesh_path
+
+         generate_btn.click(
+             fn=generate,
+             inputs=[shape_prompt, shape_guidance, shape_steps],
+             outputs=[shape_output]
+         )
+
+         refine_btn.click(
+             fn=refine,
+             inputs=[shape_output, refine_prompt, refine_steps, refine_guidance],
+             outputs=[refined_output, mesh_output]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_demo()
+     demo.launch(share=True)
app2.py ADDED
@@ -0,0 +1,419 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from omegaconf import OmegaConf
+ from pytorch_lightning import seed_everything
+ from huggingface_hub import hf_hub_download
+ from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
+ from einops import rearrange
+ from shap_e.diffusion.sample import sample_latents
+ from shap_e.diffusion.gaussian_diffusion import diffusion_from_config
+ from shap_e.models.download import load_model, load_config
+ from shap_e.util.notebooks import create_pan_cameras, decode_latent_images, create_custom_cameras
+
+ from src.utils.train_util import instantiate_from_config
+ from src.utils.camera_util import (
+     FOV_to_intrinsics,
+     get_zero123plus_input_cameras,
+     get_circular_camera_poses,
+     spherical_camera_pose
+ )
+ from src.utils.mesh_util import save_obj, save_glb
+ from src.utils.infer_util import remove_background, resize_foreground
+
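+ # Builds both stages: the zero123plus diffusion pipeline (with a widened
+ # 8-channel conv_in and a fine-tuned UNet checkpoint) and the InstantMesh
+ # sparse-view reconstruction model.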
+ def load_models():
+     """Initialize and load all required models"""
+     config = OmegaConf.load('configs/instant-nerf-large-best.yaml')
+     model_config = config.model_config
+     infer_config = config.infer_config
+
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # Load diffusion pipeline
+     print('Loading diffusion pipeline...')
+     pipeline = DiffusionPipeline.from_pretrained(
+         "sudo-ai/zero123plus-v1.2",
+         custom_pipeline="zero123plus",
+         torch_dtype=torch.float16
+     )
+     pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
+         pipeline.scheduler.config, timestep_spacing='trailing'
+     )
+
+     # Modify UNet to handle 8 input channels instead of 4
+     in_channels = 8
+     out_channels = pipeline.unet.conv_in.out_channels
+     pipeline.unet.register_to_config(in_channels=in_channels)
+     with torch.no_grad():
+         new_conv_in = nn.Conv2d(
+             in_channels, out_channels, pipeline.unet.conv_in.kernel_size,
+             pipeline.unet.conv_in.stride, pipeline.unet.conv_in.padding
+         )
+         new_conv_in.weight.zero_()
+         new_conv_in.weight[:, :4, :, :].copy_(pipeline.unet.conv_in.weight)
+         pipeline.unet.conv_in = new_conv_in
+
+     # Load custom UNet
+     print('Loading custom UNet...')
+     unet_path = "best_21.ckpt"
+     state_dict = torch.load(unet_path, map_location='cpu')
+
+     # Process the state dict to match the model keys
+     if 'state_dict' in state_dict:
+         new_state_dict = {key.replace('unet.unet.', ''): value for key, value in state_dict['state_dict'].items()}
+         pipeline.unet.load_state_dict(new_state_dict, strict=False)
+     else:
+         pipeline.unet.load_state_dict(state_dict, strict=False)
+
+     pipeline = pipeline.to(device).to(torch_dtype=torch.float16)
+
+     # Load reconstruction model
+     print('Loading reconstruction model...')
+     model = instantiate_from_config(model_config)
+     model_path = hf_hub_download(
+         repo_id="TencentARC/InstantMesh",
+         filename="instant_nerf_large.ckpt",
+         repo_type="model"
+     )
+     state_dict = torch.load(model_path, map_location='cpu')['state_dict']
+     state_dict = {k[14:]: v for k, v in state_dict.items()
+                   if k.startswith('lrm_generator.') and 'source_camera' not in k}
+     model.load_state_dict(state_dict, strict=True)
+     model = model.to(device)
+     model.eval()
+
+     return pipeline, model, infer_config
+
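+ # Standalone refinement helper for uploaded files: accepts a ready-made
+ # 640x960 layout, a single view (tiled six times), or up to six views, and
+ # assembles a 3x2 grid of 320x320 tiles before calling pipeline.refine.
+ # The Gradio demo below goes through RefinerInterface rather than this helper.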
+ def process_images(input_images, prompt, steps=75, guidance_scale=7.5, pipeline=None):
+     """Process input images and run refinement"""
+     device = pipeline.device
+
+     if isinstance(input_images, list):
+         if len(input_images) == 1:
+             # Check if this is a pre-arranged layout
+             img = Image.open(input_images[0].name).convert('RGB')
+             if img.size == (640, 960):
+                 # This is already a layout, use it directly
+                 input_image = img
+             else:
+                 # Single view - need 6 copies
+                 img = img.resize((320, 320))
+                 img_array = np.array(img) / 255.0
+                 images = [img_array] * 6
+                 images = np.stack(images)
+
+                 # Convert to tensor and create layout
+                 images = torch.from_numpy(images).float()
+                 images = images.permute(0, 3, 1, 2)
+                 images = images.reshape(3, 2, 3, 320, 320)
+                 images = images.permute(0, 2, 3, 1, 4)
+                 images = images.reshape(3, 3, 320, 640)
+                 images = images.reshape(1, 3, 960, 640)
+
+                 # Convert back to PIL
+                 images = images.permute(0, 2, 3, 1)[0]
+                 images = (images.numpy() * 255).astype(np.uint8)
+                 input_image = Image.fromarray(images)
+         else:
+             # Multiple individual views
+             images = []
+             for img_file in input_images:
+                 img = Image.open(img_file.name).convert('RGB')
+                 img = img.resize((320, 320))
+                 img = np.array(img) / 255.0
+                 images.append(img)
+
+             # Pad to 6 images if needed
+             while len(images) < 6:
+                 images.append(np.zeros_like(images[0]))
+             images = np.stack(images[:6])
+
+             # Convert to tensor and create layout
+             images = torch.from_numpy(images).float()
+             images = images.permute(0, 3, 1, 2)
+             images = images.reshape(3, 2, 3, 320, 320)
+             images = images.permute(0, 2, 3, 1, 4)
+             images = images.reshape(3, 3, 320, 640)
+             images = images.reshape(1, 3, 960, 640)
+
+             # Convert back to PIL
+             images = images.permute(0, 2, 3, 1)[0]
+             images = (images.numpy() * 255).astype(np.uint8)
+             input_image = Image.fromarray(images)
+     else:
+         raise ValueError("Expected a list of images")
+
+     # Generate refined output
+     output = pipeline.refine(
+         input_image,
+         prompt=prompt,
+         num_inference_steps=int(steps),
+         guidance_scale=guidance_scale
+     ).images[0]
+
+     return output, input_image
+
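+ # Splits the refined 3x2 grid back into six 320x320 views, then runs the
+ # reconstruction model: forward_planes builds the triplane features and
+ # extract_mesh pulls out vertices, faces, and per-vertex colors.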
+ def create_mesh(refined_image, model, infer_config):
+     """Generate mesh from refined image"""
+     # Convert PIL image to tensor
+     image = np.array(refined_image) / 255.0
+     image = torch.from_numpy(image).float().permute(2, 0, 1)
+
+     # Reshape to 6 views
+     image = image.reshape(3, 960, 640)
+     image = image.reshape(3, 3, 320, 640)
+     image = image.permute(1, 0, 2, 3)
+     image = image.reshape(3, 3, 320, 2, 320)
+     image = image.permute(0, 3, 1, 2, 4)
+     image = image.reshape(6, 3, 320, 320)
+
+     # Add batch dimension
+     image = image.unsqueeze(0)
+
+     input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to("cuda")
+     image = image.to("cuda")
+
+     with torch.no_grad():
+         planes = model.forward_planes(image, input_cameras)
+         mesh_out = model.extract_mesh(planes, **infer_config)
+         vertices, faces, vertex_colors = mesh_out
+
+     return vertices, faces, vertex_colors
+
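+ # Wraps the Shap-E text-to-3D stage: samples a latent from the prompt, then
+ # renders it from the six fixed camera poses the refinement stage expects.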
+ class ShapERenderer:
+     def __init__(self, device):
+         print("Loading Shap-E models...")
+         self.device = device
+         self.xm = load_model('transmitter', device=device)
+         self.model = load_model('text300M', device=device)
+         self.diffusion = diffusion_from_config(load_config('diffusion'))
+         print("Shap-E models loaded!")
+
+     def generate_views(self, prompt, guidance_scale=15.0, num_steps=64):
+         # Generate latents using the text-to-3D model
+         batch_size = 1
+         guidance_scale = float(guidance_scale)
+         latents = sample_latents(
+             batch_size=batch_size,
+             model=self.model,
+             diffusion=self.diffusion,
+             guidance_scale=guidance_scale,
+             model_kwargs=dict(texts=[prompt] * batch_size),
+             progress=True,
+             clip_denoised=True,
+             use_fp16=True,
+             use_karras=True,
+             karras_steps=num_steps,
+             sigma_min=1e-3,
+             sigma_max=160,
+             s_churn=0,
+         )
+
+         # Render the 6 views we need with specific viewing angles
+         size = 320  # Size of each rendered image
+         images = []
+
+         # Define our 6 specific camera positions to match refine.py
+         azimuths = [30, 90, 150, 210, 270, 330]
+         elevations = [20, -10, 20, -10, 20, -10]
+
+         for i, (azimuth, elevation) in enumerate(zip(azimuths, elevations)):
+             cameras = create_custom_cameras(size, self.device, azimuths=[azimuth], elevations=[elevation], fov_degrees=30, distance=3.0)
+             rendered_image = decode_latent_images(
+                 self.xm,
+                 latents[0],
+                 rendering_mode='stf',
+                 cameras=cameras
+             )
+             images.append(rendered_image.detach().cpu().numpy())
+
+         # Convert images to uint8 (no 0-1 -> 0-255 scaling is applied here)
+         images = [image.astype(np.uint8) for image in images]
+
+         # Assemble a 3x2 grid of 320px tiles (3 rows, 2 columns -> 960x640 array)
+         layout = np.zeros((960, 640, 3), dtype=np.uint8)
+         for i, img in enumerate(images):
+             row = i // 2  # 2 images per row, 3 rows
+             col = i % 2
+             layout[row*320:(row+1)*320, col*320:(col+1)*320] = img
+
+         return Image.fromarray(layout), images
+
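+ # Ties the two stages together for the UI: normalizes the incoming 6-view
+ # grid, runs the zero123plus refinement pass, and reconstructs a mesh saved
+ # to a temporary .obj file.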
+ class RefinerInterface:
+     def __init__(self):
+         print("Initializing InstantMesh models...")
+         self.pipeline, self.model, self.infer_config = load_models()
+         print("InstantMesh models loaded!")
+
+     def refine_model(self, input_image, prompt, steps=75, guidance_scale=7.5):
+         """Main refinement function"""
+         # Process image and get refined output
+         input_image = Image.fromarray(input_image)
+
+         # If the grid arrives as 2 rows x 3 columns (960 wide, 640 tall),
+         # rearrange it into the 3 rows x 2 columns layout the pipeline expects
+         if input_image.width == 960 and input_image.height == 640:
+             input_array = np.array(input_image)
+             new_layout = np.zeros((960, 640, 3), dtype=np.uint8)
+
+             # Move each 320px tile from its 2x3 position to its 3x2 position
+             for i in range(6):
+                 src_row = i // 3
+                 src_col = i % 3
+                 dst_row = i // 2
+                 dst_col = i % 2
+
+                 new_layout[dst_row*320:(dst_row+1)*320, dst_col*320:(dst_col+1)*320] = \
+                     input_array[src_row*320:(src_row+1)*320, src_col*320:(src_col+1)*320]
+
+             input_image = Image.fromarray(new_layout)
+
+         # Process with the pipeline (expects the 3 rows x 2 columns grid)
+         refined_output_960x640 = self.pipeline.refine(
+             input_image,
+             prompt=prompt,
+             num_inference_steps=int(steps),
+             guidance_scale=guidance_scale
+         ).images[0]
+
+         # Generate mesh using the 960x640 format
+         vertices, faces, vertex_colors = create_mesh(
+             refined_output_960x640,
+             self.model,
+             self.infer_config
+         )
+
+         # Save temporary mesh file
+         os.makedirs("temp", exist_ok=True)
+         temp_obj = os.path.join("temp", "refined_mesh.obj")
+         save_obj(vertices, faces, vertex_colors, temp_obj)
+
+         # Prepare the display copy of the refined grid; source and destination
+         # tile indices below are identical, so this loop is a straight copy
+         refined_array = np.array(refined_output_960x640)
+         display_layout = np.zeros((960, 640, 3), dtype=np.uint8)
+
+         for i in range(6):
+             src_row = i // 2
+             src_col = i % 2
+             dst_row = i // 2
+             dst_col = i % 2
+
+             display_layout[dst_row*320:(dst_row+1)*320, dst_col*320:(dst_col+1)*320] = \
+                 refined_array[src_row*320:(src_row+1)*320, src_col*320:(src_col+1)*320]
+
+         refined_output_640x960 = Image.fromarray(display_layout)
+
+         return refined_output_640x960, temp_obj
+
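+ # Gradio UI: prompt -> Shap-E views -> zero123plus refinement -> 3D mesh preview.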
+ def create_demo():
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     shap_e = ShapERenderer(device)
+     refiner = RefinerInterface()
+
+     with gr.Blocks() as demo:
+         gr.Markdown("# Shap-E to InstantMesh Pipeline")
+
+         # First row: Controls
+         with gr.Row():
+             with gr.Column():
+                 # Shap-E inputs
+                 shape_prompt = gr.Textbox(
+                     label="Shap-E Prompt",
+                     placeholder="Enter text to generate initial 3D model..."
+                 )
+                 shape_guidance = gr.Slider(
+                     minimum=1,
+                     maximum=30,
+                     value=15.0,
+                     label="Shap-E Guidance Scale"
+                 )
+                 shape_steps = gr.Slider(
+                     minimum=16,
+                     maximum=128,
+                     value=64,
+                     step=16,
+                     label="Shap-E Steps"
+                 )
+                 generate_btn = gr.Button("Generate Views")
+
+             with gr.Column():
+                 # Refinement inputs
+                 refine_prompt = gr.Textbox(
+                     label="Refinement Prompt",
+                     placeholder="Enter prompt to guide refinement..."
+                 )
+                 refine_steps = gr.Slider(
+                     minimum=30,
+                     maximum=100,
+                     value=75,
+                     step=1,
+                     label="Refinement Steps"
+                 )
+                 refine_guidance = gr.Slider(
+                     minimum=1,
+                     maximum=20,
+                     value=7.5,
+                     label="Refinement Guidance Scale"
+                 )
+                 refine_btn = gr.Button("Refine")
+
+         # Second row: Image panels side by side
+         with gr.Row():
+             # Outputs - Images side by side
+             shape_output = gr.Image(
+                 label="Generated Views",
+                 width=640,   # layout images are 640 wide
+                 height=960   # by 960 tall
+             )
+             refined_output = gr.Image(
+                 label="Refined Output",
+                 width=640,
+                 height=960
+             )
+
+         # Third row: 3D mesh panel below
+         with gr.Row():
+             # 3D mesh centered
+             mesh_output = gr.Model3D(
+                 label="3D Mesh",
+                 clear_color=[1.0, 1.0, 1.0, 1.0],
+                 width=1280,  # Full width
+                 height=600   # Taller for better visualization
+             )
+
+         # Set up event handlers
+         def generate(prompt, guidance_scale, num_steps):
+             with torch.no_grad():
+                 layout, _ = shap_e.generate_views(prompt, guidance_scale, num_steps)
+             return layout
+
+         def refine(input_image, prompt, steps, guidance_scale):
+             refined_img, mesh_path = refiner.refine_model(
+                 input_image,
+                 prompt,
+                 steps,
+                 guidance_scale
+             )
+             return refined_img, mesh_path
+
+         generate_btn.click(
+             fn=generate,
+             inputs=[shape_prompt, shape_guidance, shape_steps],
+             outputs=[shape_output]
+         )
+
+         refine_btn.click(
+             fn=refine,
+             inputs=[shape_output, refine_prompt, refine_steps, refine_guidance],
+             outputs=[refined_output, mesh_output]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_demo()
+     demo.launch(share=True)