import os
import tempfile

import gradio as gr
import numpy as np
import torch
import trimesh
from PIL import Image
# NOTE: Zero123Pipeline is not exported by mainline diffusers releases; this
# import assumes a diffusers build/fork that provides a Zero123 pipeline.
from diffusers import Zero123Pipeline

# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize the pipeline (fp16 only makes sense on GPU)
pipe = Zero123Pipeline.from_pretrained(
    "bennyguo/zero123-xl-diffusers",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)


def image_to_3d(input_image, num_inference_steps=75, guidance_scale=3.0):
    """Convert a single image to a 3D point cloud, saved as OBJ and PLY."""
    # Preprocess image
    if input_image is None:
        return None
    input_image = input_image.convert("RGB").resize((256, 256))

    # Generate novel views from different angles using Zero123
    images = []
    for elevation in [0, 30]:
        for azimuth in [0, 90, 180, 270]:
            print(f"Generating view: elevation={elevation}, azimuth={azimuth}")
            with torch.no_grad():
                image = pipe(
                    image=input_image,
                    elevation=elevation,
                    azimuth=azimuth,
                    num_inference_steps=num_inference_steps,
                    guidance_scale=guidance_scale,
                ).images[0]
            images.append(np.array(image))

    # Build a point cloud from the views. This is a deliberately simplified
    # heuristic (pixel brightness stands in for depth); in production you
    # would want a real multi-view reconstruction method instead.
    points = []
    for i, img in enumerate(images):
        # Treat grayscale intensity as a crude depth estimate
        gray = np.mean(img, axis=2)

        # Recover the camera angles this view was rendered from
        # (the generation loop above iterates azimuth fastest)
        elevation = 0 if i < 4 else 30
        azimuth = (i % 4) * 90
        elevation_rad = np.radians(elevation)
        azimuth_rad = np.radians(azimuth)

        # Sample every 4th pixel and place it along the view direction.
        # The point coordinates are named px/py/pz so they don't clobber
        # the x/y pixel loop variables.
        h, w = gray.shape
        for y in range(0, h, 4):
            for x in range(0, w, 4):
                depth = gray[y, x] / 255.0  # normalize depth to [0, 1]
                # Spherical-to-Cartesian conversion for the view direction
                px = depth * np.cos(elevation_rad) * np.sin(azimuth_rad)
                py = depth * np.sin(elevation_rad)
                pz = depth * np.cos(elevation_rad) * np.cos(azimuth_rad)
                points.append([px, py, pz])

    point_cloud = np.array(points)

    # Save the point cloud; close the temp file before exporting so the
    # write works on all platforms
    with tempfile.NamedTemporaryFile(suffix=".obj", delete=False) as tmp_file:
        obj_path = tmp_file.name
    cloud = trimesh.points.PointCloud(point_cloud)
    cloud.export(obj_path)

    # Also export as PLY, which has better point-cloud support
    ply_path = obj_path.replace(".obj", ".ply")
    cloud.export(ply_path)

    return [obj_path, ply_path]


def process_image(image, num_steps, guidance):
    try:
        model_paths = image_to_3d(
            image,
            num_inference_steps=int(num_steps),  # Gradio sliders return floats
            guidance_scale=guidance,
        )
        if model_paths:
            return model_paths[0], model_paths[1], "3D model generated successfully!"
        return None, None, "Failed to process the image."
    except Exception as e:
        return None, None, f"Error: {str(e)}"
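
# Unity and most engines import triangle meshes more readily than raw point
# clouds. A minimal sketch of one way to turn the sampled points into a
# watertight mesh via trimesh's convex hull -- crude, but importable. The
# helper `point_cloud_to_mesh` is illustrative only; nothing above requires it.
def point_cloud_to_mesh(points):
    """Wrap a point cloud in its convex hull to get a mesh with faces."""
    cloud = trimesh.points.PointCloud(points)
    return cloud.convex_hull  # a trimesh.Trimesh, exportable as OBJ/PLY/GLB
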
# Create Gradio interface
with gr.Blocks(title="Image to 3D Model Converter") as demo:
    gr.Markdown("# Image to 3D Model Converter")
    gr.Markdown(
        "Upload an image to convert it to a 3D model that you can use in "
        "Unity or other engines."
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Input Image")
            num_steps = gr.Slider(
                minimum=20, maximum=100, value=75, step=5,
                label="Number of Inference Steps",
            )
            guidance = gr.Slider(
                minimum=1.0, maximum=7.0, value=3.0, step=0.5,
                label="Guidance Scale",
            )
            submit_btn = gr.Button("Convert to 3D")

        with gr.Column(scale=1):
            obj_file = gr.File(label="OBJ File")
            ply_file = gr.File(label="PLY File")
            output_message = gr.Textbox(label="Output Message")

    submit_btn.click(
        fn=process_image,
        inputs=[input_image, num_steps, guidance],
        outputs=[obj_file, ply_file, output_message],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
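
# A hedged sketch of driving the running app programmatically with
# gradio_client (a separate process; "/process_image" is the endpoint name
# Gradio auto-generates for the click handler above, and handle_file requires
# a recent gradio_client -- both may vary by version):
#
#     from gradio_client import Client, handle_file
#     client = Client("http://127.0.0.1:7860/")
#     obj_path, ply_path, message = client.predict(
#         handle_file("photo.png"), 75, 3.0, api_name="/process_image"
#     )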