Spaces:

VAST-AI
/

TripoSG-scribble

Running on Zero

App Files Files Community

bennyguo committed on Apr 17

Commit

af15ec4

1 Parent(s): b98ab62

add wd14 tagging if prompt is not given

Browse files

Files changed (2) hide show

app.py +222 -40
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,9 +1,23 @@
 import gradio as gr
 import os
 import sys
 import subprocess
-from huggingface_hub import snapshot_download, HfFolder
 import random # Import random for seed generation
 # --- Repo Setup ---
 DEFAULT_REPO_DIR = "./TripoSG-repo" # Directory to clone into if not using local path
@@ -152,66 +166,234 @@ MAX_SEED = np.iinfo(np.int32).max
 def get_random_seed():
     return random.randint(0, MAX_SEED)
 # Apply decorator conditionally
 @spaces.GPU() if ENABLE_ZEROGPU else lambda func: func
-def generate_3d(scribble_image_dict, prompt, scribble_confidence, prompt_confidence, seed): # Added text_confidence parameter
     print("Generating 3D model...")
-    # Extract the composite image from the ImageEditor dictionary
     if scribble_image_dict is None or scribble_image_dict.get("composite") is None:
         print("No scribble image provided.")
-        return None # Return None if no image is provided
     # --- Seed Handling ---
     current_seed = int(seed)
     print(f"Using seed: {current_seed}")
     # --- End Seed Handling ---
-    # Get the composite image which includes the drawing
     # The composite might be RGBA if a layer was involved, ensure RGB for processing
-    image = Image.fromarray(scribble_image_dict["composite"]).convert("RGB")
     # Preprocess the image: invert colors (black on white -> white on black)
-    image_np = np.array(image)
     processed_image_np = 255 - image_np
     processed_image = Image.fromarray(processed_image_np)
-    print("Image preprocessed.")
-    # Define fixed parameters
-    # attn_scale_text = 1.0 # Replaced by text_confidence input
-    # Set the generator with the provided seed
     generator = torch.Generator(device='cuda').manual_seed(current_seed)
-    # Run the pipeline
     print("Running pipeline...")
-    out = pipe(
-        processed_image,
-        prompt=prompt,
-        num_tokens=512, # Default value from example
-        guidance_scale=0, # Default value from example
-        num_inference_steps=16, # Default value from example
-        attention_kwargs={
-            "cross_attention_scale": prompt_confidence, # Use input parameter
-            "cross_attention_2_scale": scribble_confidence
-        },
-        generator=generator,
-        use_flash_decoder=False,
-        dense_octree_depth=8,
-        hierarchical_octree_depth=8
-    )
-    print("Pipeline finished.")
-    # Save the output mesh to a temporary file
     if out.meshes and len(out.meshes) > 0:
         # Create a temporary file with .glb extension
         with tempfile.NamedTemporaryFile(suffix=".glb", delete=False) as tmpfile:
             output_path = tmpfile.name
         out.meshes[0].export(output_path)
         print(f"Mesh saved to temporary file: {output_path}")
-        return output_path
     else:
         print("Pipeline did not generate any meshes.")
-        return None
 # Create the Gradio interface
 with gr.Blocks() as demo:
@@ -242,21 +424,21 @@ with gr.Blocks() as demo:
     submit_button.click(
         fn=generate_3d,
-        inputs=gen_inputs, # Include seed_input and text_confidence_input
-        outputs=model_output
     )
     # Define inputs for the lucky button (same as main button for the final call)
     lucky_gen_inputs = [image_input, prompt_input, confidence_input, prompt_confidence_input, seed_input] # Added text_confidence_input
     lucky_button.click(
-        fn=get_random_seed, # First, get a random seed
         inputs=[],
-        outputs=[seed_input] # Update the seed input field
     ).then(
-        fn=generate_3d, # Then, generate the model
-        inputs=lucky_gen_inputs, # Use the updated seed from the input field
-        outputs=model_output
     )
 # Launch with queue enabled if using ZeroGPU

+# --- Environment Variables Used ---
+# ENABLE_ZEROGPU: Set to 'true' or '1' to enable @spaces.GPU decorator (for Hugging Face Spaces).
+# TRIPOSG_CODE_PATH: Absolute path to a local directory containing the checked-out TripoSG repository (scribble branch).
+# GITHUB_TOKEN: A GitHub token used for cloning the TripoSG repo if TRIPOSG_CODE_PATH is not provided.
+# WEIGHTS_PATH: Absolute path to a local directory containing the TripoSG-scribble model weights.
+# HF_TOKEN: A Hugging Face Hub token used for downloading weights/models if local paths (WEIGHTS_PATH, WD14_CONVNEXT_PATH) are not provided.
+# WD14_CONVNEXT_PATH: Absolute path to a local directory containing the WD14 ConvNeXT tagger model.onnx and selected_tags.csv.
+# ----------------------------------
 import gradio as gr
 import os
 import sys
 import subprocess
+from huggingface_hub import snapshot_download, HfFolder, hf_hub_download
 import random # Import random for seed generation
+import re # For WD14 tag processing
+import cv2 # For WD14 preprocessing
+import pandas as pd # For WD14 tags
+from onnxruntime import InferenceSession # For WD14 model
+from typing import Mapping, Tuple, Dict # Type hints
 # --- Repo Setup ---
 DEFAULT_REPO_DIR = "./TripoSG-repo" # Directory to clone into if not using local path
 def get_random_seed():
     return random.randint(0, MAX_SEED)
+# --- WD14 Helper Functions ---
def make_square(img, target_size):
    """Pad ``img`` with white borders so that it becomes square.

    The side of the resulting square is the largest of the image height, the
    image width and ``target_size``. The padding is split between opposite
    edges; when the amount is odd the extra pixel goes to the bottom/right.

    Args:
        img: Image array whose first two dimensions are (height, width).
        target_size: Minimum side length of the returned square.

    Returns:
        The padded image as produced by ``cv2.copyMakeBorder``.
    """
    rows, cols = img.shape[0], img.shape[1]
    side = max(rows, cols, target_size)

    pad_rows = side - rows
    pad_cols = side - cols
    pad_top = pad_rows // 2
    pad_left = pad_cols // 2

    return cv2.copyMakeBorder(
        img,
        pad_top,
        pad_rows - pad_top,
        pad_left,
        pad_cols - pad_left,
        cv2.BORDER_CONSTANT,
        value=[255, 255, 255],  # White padding
    )
def smart_resize(img, size):
    """Resize ``img`` to ``size`` x ``size``, choosing interpolation by direction.

    Only the first dimension of ``img`` is compared against ``size``:
    ``cv2.INTER_AREA`` is used when shrinking and ``cv2.INTER_CUBIC`` when
    enlarging. When the first dimension already equals ``size`` the input is
    returned untouched.
    """
    current = img.shape[0]
    if current > size:
        method = cv2.INTER_AREA
    elif current < size:
        method = cv2.INTER_CUBIC
    else:
        # Nothing to do: hand back the very same object.
        return img
    return cv2.resize(img, (size, size), interpolation=method)
+RE_SPECIAL = re.compile(r'([\()])')
+# --- WD14 Tagger Class ---
class WaifuDiffusionInterrogator:
    """Lazy wrapper around a WD14 ONNX tagger model and its tag table.

    The model and tags file are looked up in ``local_model_dir`` first (when
    given) and otherwise downloaded from the Hugging Face repo ``repo``.
    Nothing is loaded until the first call that needs the model.
    """

    def __init__(
            self,
            repo: str,
            model_filename='model.onnx',
            tags_filename='selected_tags.csv',
            local_model_dir: str | None = None # Added local path option
    ) -> None:
        """Store the configuration; no file or network access happens here.

        Args:
            repo: Hugging Face repo id used when local files are unavailable.
            model_filename: Name of the ONNX model file.
            tags_filename: Name of the CSV file listing the tags.
            local_model_dir: Optional directory checked before downloading.
        """
        self.__repo = repo
        self.__model_filename = model_filename
        self.__tags_filename = tags_filename
        self.__local_model_dir = local_model_dir
        self.__initialized = False
        self._model = None
        self._tags = None

    def _init(self) -> None:
        """Load the ONNX session and tag table once.

        Failures are printed rather than raised; in that case the instance
        stays uninitialized and the next call will try again.
        """
        if self.__initialized:
            return
        model_path = None
        tags_path = None
        if self.__local_model_dir:
            print(f"WD14: Attempting to load from local directory: {self.__local_model_dir}")
            potential_model_path = os.path.join(self.__local_model_dir, self.__model_filename)
            potential_tags_path = os.path.join(self.__local_model_dir, self.__tags_filename)
            if os.path.exists(potential_model_path) and os.path.exists(potential_tags_path):
                model_path = potential_model_path
                tags_path = potential_tags_path
                print("WD14: Found local model and tags file.")
            else:
                print("WD14: Local files not found. Falling back to Hugging Face download.")
        if model_path is None or tags_path is None:
            print(f"WD14: Downloading from repo: {self.__repo}")
            hf_token = os.environ.get("HF_TOKEN") # Reuse HF token if available
            try:
                model_path = hf_hub_download(self.__repo, filename=self.__model_filename, token=hf_token)
                tags_path = hf_hub_download(self.__repo, filename=self.__tags_filename, token=hf_token)
                print("WD14: Download complete.")
            except Exception as e:
                print(f"WD14: Error downloading from Hugging Face: {e}")
                # Leave the tagger uninitialized; _calculation() reports it
                # and returns None so callers can fall back.
                return # Cannot initialize
        try:
            self._model = InferenceSession(str(model_path))
            self._tags = pd.read_csv(tags_path)
            self.__initialized = True
            print("WD14: Tagger initialized successfully.")
        except Exception as e:
            print(f"WD14: Error initializing ONNX session or reading tags: {e}")

    def _calculation(self, image: Image.Image) -> pd.DataFrame | None:
        """Run the tagger on ``image`` and return per-tag confidences.

        Returns:
            A copy of the ``name`` and ``category`` columns of the tag table
            with an added ``confidence`` column, or ``None`` when the tagger
            could not be initialized.
        """
        self._init()
        if not self._model or self._tags is None:
             print("WD14: Tagger not initialized.")
             return None
        # The second entry of the model's 4-element input shape is used as
        # the square side length the image is resized to.
        _, height, _, _ = self._model.get_inputs()[0].shape
        # Flatten any transparency onto a white background.
        image = image.convert('RGBA')
        new_image = Image.new('RGBA', image.size, 'WHITE')
        new_image.paste(image, mask=image)
        image = new_image.convert('RGB')
        # NOTE: the image is intentionally not written to disk here; a fixed
        # debug filename in the working directory would be shared by all
        # requests and would break tagging on a non-writable directory.
        image = np.asarray(image)
        image = image[:, :, ::-1]  # Reverse the channel order (RGB -> BGR)
        image = make_square(image, height)
        image = smart_resize(image, height)
        image = image.astype(np.float32)
        image = np.expand_dims(image, 0)  # Add the batch dimension
        input_name = self._model.get_inputs()[0].name
        label_name = self._model.get_outputs()[0].name
        confidence = self._model.run([label_name], {input_name: image})[0]
        full_tags = self._tags[['name', 'category']].copy()
        full_tags['confidence'] = confidence[0]
        return full_tags

    def interrogate(self, image: Image.Image) -> Tuple[Dict[str, float], Dict[str, float]] | None:
        """Tag ``image`` and split the result into ratings and tags.

        Returns:
            ``(ratings, tags)`` where both map a name to its confidence; rows
            whose ``category`` equals 9 go to ``ratings`` and all others to
            ``tags``. ``None`` is returned when the tagger is unavailable.
        """
        full_tags = self._calculation(image)
        if full_tags is None:
            return None
        is_rating = full_tags['category'] == 9
        ratings = dict(full_tags[is_rating][['name', 'confidence']].values)
        tags = dict(full_tags[~is_rating][['name', 'confidence']].values)
        return ratings, tags
+# --- Instantiate WD14 Tagger ---
+WD14_CONVNEXT_REPO = 'SmilingWolf/wd-v1-4-convnext-tagger'
+wd14_local_path = os.environ.get("WD14_CONVNEXT_PATH")
+wd14_tagger = WaifuDiffusionInterrogator(repo=WD14_CONVNEXT_REPO, local_model_dir=wd14_local_path)
+# --- Helper to format tags ---
def format_wd14_tags(tags: Dict[str, float], threshold: float = 0.35) -> str:
    """Turn WD14 tag confidences into a comma-separated prompt string.

    Tags are kept when their score reaches ``threshold``, their name does not
    contain "background" and they are not one of a few generic style tags.
    The survivors are ordered by descending score (ties broken
    alphabetically) and underscores are replaced with spaces.

    Args:
        tags: Mapping from tag name to confidence.
        threshold: Minimum confidence for a tag to be kept.

    Returns:
        The kept tags joined with ", " (an empty string when none remain).
    """
    ignored = {"monochrome", "greyscale", "no_humans", "comic", "solo"}

    kept = {}
    for name, confidence in tags.items():
        if confidence >= threshold and "background" not in name and name not in ignored:
            kept[name] = confidence
    print(kept)

    # Highest score first; equal scores fall back to alphabetical order.
    ranked = sorted(kept, key=lambda name: (-kept[name], name))
    return ', '.join(name.replace('_', ' ') for name in ranked)
 # Apply decorator conditionally
 @spaces.GPU() if ENABLE_ZEROGPU else lambda func: func
+def generate_3d(scribble_image_dict, prompt, scribble_confidence, text_confidence, seed):
     print("Generating 3D model...")
+    input_prompt = prompt # Keep track of original prompt for return on early exit
     if scribble_image_dict is None or scribble_image_dict.get("composite") is None:
         print("No scribble image provided.")
+        return None, input_prompt # Return None for model, original prompt
+    # --- Prompt Handling ---
+    input_prompt = prompt.strip()
+    if not input_prompt:
+        print("Prompt is empty, attempting WD14 tagging...")
+        try:
+            # Get the user drawing (black on white) for tagging
+            user_drawing_img = Image.fromarray(scribble_image_dict["composite"]).convert("RGB")
+            tag_results = wd14_tagger.interrogate(user_drawing_img)
+            if tag_results:
+                ratings, tags = tag_results
+                generated_prompt = format_wd14_tags(tags) # Use default threshold
+                if generated_prompt:
+                    print(f"WD14 generated prompt: {generated_prompt}")
+                    input_prompt = generated_prompt
+                else:
+                    print("WD14 tagging did not produce tags above threshold.")
+                    input_prompt = "object" # Fallback prompt
+            else:
+                print("WD14 tagging failed or tagger not initialized.")
+                input_prompt = "object" # Fallback prompt
+        except Exception as e:
+            print(f"Error during WD14 tagging: {e}")
+            input_prompt = "object" # Fallback prompt
+    else:
+        print(f"Using user provided prompt: {input_prompt}")
+    # --- End Prompt Handling ---
     # --- Seed Handling ---
     current_seed = int(seed)
     print(f"Using seed: {current_seed}")
     # --- End Seed Handling ---
+    # --- Image Preprocessing for TripoSG ---
+    # Get the composite image again (safer in case dict is modified)
     # The composite might be RGBA if a layer was involved, ensure RGB for processing
+    image_for_triposg = Image.fromarray(scribble_image_dict["composite"]).convert("RGB")
     # Preprocess the image: invert colors (black on white -> white on black)
+    image_np = np.array(image_for_triposg)
     processed_image_np = 255 - image_np
     processed_image = Image.fromarray(processed_image_np)
+    print("Image preprocessed for TripoSG.")
+    # --- End Image Preprocessing ---
+    # --- Generator Setup ---
     generator = torch.Generator(device='cuda').manual_seed(current_seed)
+    # --- End Generator Setup ---
+    # --- Run Pipeline ---
     print("Running pipeline...")
+    try:
+        out = pipe(
+            processed_image,
+            prompt=input_prompt, # Use the potentially generated prompt
+            num_tokens=512, # Default value from example
+            guidance_scale=0, # Default value from example
+            num_inference_steps=16, # Default value from example
+            attention_kwargs={
+                "cross_attention_scale": text_confidence,
+                "cross_attention_2_scale": scribble_confidence
+            },
+            generator=generator,
+            use_flash_decoder=False, # Default value from example
+            dense_octree_depth=8, # Default value from example
+            hierarchical_octree_depth=8 # Default value from example
+        )
+        print("Pipeline finished.")
+    except Exception as e:
+        print(f"Error during pipeline execution: {e}")
+        return None, input_prompt # Return None for model, the prompt used
+    # --- End Run Pipeline ---
+    # --- Save Output ---
     if out.meshes and len(out.meshes) > 0:
         # Create a temporary file with .glb extension
         with tempfile.NamedTemporaryFile(suffix=".glb", delete=False) as tmpfile:
             output_path = tmpfile.name
         out.meshes[0].export(output_path)
         print(f"Mesh saved to temporary file: {output_path}")
+        return output_path, input_prompt # Return model path and the prompt used
     else:
         print("Pipeline did not generate any meshes.")
+        return None, input_prompt # Return None for model, the prompt used
+    # --- End Save Output ---
 # Create the Gradio interface
 with gr.Blocks() as demo:
     submit_button.click(
         fn=generate_3d,
+        inputs=gen_inputs,
+        outputs=[model_output, prompt_input] # Add prompt_input to outputs
     )
     # Define inputs for the lucky button (same as main button for the final call)
     lucky_gen_inputs = [image_input, prompt_input, confidence_input, prompt_confidence_input, seed_input] # Added text_confidence_input
     lucky_button.click(
+        fn=get_random_seed,
         inputs=[],
+        outputs=[seed_input]
     ).then(
+        fn=generate_3d,
+        inputs=lucky_gen_inputs,
+        outputs=[model_output, prompt_input] # Add prompt_input to outputs
     )
 # Launch with queue enabled if using ZeroGPU

requirements.txt CHANGED Viewed

@@ -14,3 +14,4 @@ typeguard
 ninja
 gltflib
 https://huggingface.co/spaces/VAST-AI/TripoSG/resolve/main/diso-0.1.4-cp310-cp310-linux_x86_64.whl?download=true

 ninja
 gltflib
 https://huggingface.co/spaces/VAST-AI/TripoSG/resolve/main/diso-0.1.4-cp310-cp310-linux_x86_64.whl?download=true
+onnxruntime