roman-bachmann committed
Commit 0c0b8b9 · 1 Parent(s): b2e1f99

added optional SR

Files changed (2):
  1. app.py +23 -4
  2. requirements.txt +1 -0
app.py CHANGED
@@ -48,6 +48,13 @@ MODEL_NAME = 'FlexTok d18-d28 (DFN)'
 # Load FlexTok model from HF Hub
 flextok_model = FlexTokFromHub.from_pretrained(MODEL_ID).to(device).eval()
 
+# Load AuraSR model from HF Hub
+try:
+    from aura_sr import AuraSR
+    aura_sr = AuraSR.from_pretrained("fal-ai/AuraSR")
+except ImportError:
+    aura_sr = None
+
 
 def img_from_path(
     path: str,
@@ -71,7 +78,7 @@ def img_from_path(
 
 
 @spaces.GPU(duration=20)
-def infer(img_path, seed=0, randomize_seed=False, timesteps=20, cfg_scale=7.5, perform_norm_guidance=True):
+def infer(img_path, seed=1000, randomize_seed=False, timesteps=25, cfg_scale=7.5, perform_norm_guidance=True, super_res=True):
     if randomize_seed:
         seed = None
 
@@ -102,6 +109,9 @@ def infer(img_path, seed=0, randomize_seed=False, timesteps=20, cfg_scale=7.5, p
         for reconst_k, k_keep in zip(all_reconst, K_KEEP_LIST)
     ]
 
+    if super_res:
+        all_images = [(aura_sr.upscale_4x(img), label) for img, label in all_images]
+
     return all_images
 
 
@@ -143,7 +153,10 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
         Official demo for: <br>
         [**FlexTok: Resampling Images into 1D Token Sequences of Flexible Length**](https://arxiv.org/abs/2502.13967), arXiv 2025 <br>
 
-        This demo uses the FlexTok tokenizer to autoencode the given RGB input, using [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), running on *{power_device}*. The FlexTok encoder produces a 1D sequence of discrete tokens that are ordered in a coarse-to-fine manner. We show reconstructions from truncated subsequences, using the first 1, 2, 4, 8, ..., 256 tokens. As you will see, the first tokens capture more high-level semantic content, while subsequent ones add fine-grained detail.
+        This demo uses the FlexTok tokenizer to autoencode the given RGB input, using [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), running on *{power_device}*.
+        The FlexTok encoder produces a 1D sequence of discrete tokens that are ordered in a coarse-to-fine manner.
+        We show reconstructions from truncated subsequences, using the first 1, 2, 4, 8, ..., 256 tokens.
+        As you will see, the first tokens capture more high-level semantic content, while subsequent ones add fine-grained detail.
         """)
 
     img_path = gr.Image(label='RGB input image', type='filepath')
@@ -151,13 +164,19 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
 
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown(f"""
-            The FlexTok decoder is a rectified flow model. The following settings control the seed of the initial noise, the number of denoising timesteps, the guidance scale, and whether to perform [Adaptive Projected Guidance](https://arxiv.org/abs/2410.02416) (we recommend enabling it).
+            The FlexTok decoder is a rectified flow model. The following settings control the seed of the initial noise, the number of denoising timesteps,
+            the guidance scale, and whether to perform [Adaptive Projected Guidance](https://arxiv.org/abs/2410.02416) (we recommend enabling it).
+
+            This FlexTok model operates at 256x256 resolution. You can optionally super-resolve the reconstructions to 1024x1024 using Aura-SR for
+            sharper details, without changing the underlying reconstructed image too much. We enable it by default, but you can disable it if you would
+            like to see the raw 256x256 FlexTok reconstructions.
             """)
         seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=1000)
         randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
         timesteps = gr.Slider(label="Denoising timesteps", minimum=1, maximum=1000, step=1, value=25)
         cfg_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=15.0, step=0.1, value=7.5)
         perform_norm_guidance = gr.Checkbox(label="Perform Adaptive Projected Guidance", value=True)
+        super_res = gr.Checkbox(label="Super-resolve reconstructions from 256x256 to 1024x1024 with Aura-SR", value=True)
 
     result = gr.Gallery(
         label="Reconstructions", show_label=True, elem_id="gallery", type='pil',
@@ -174,7 +193,7 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
 
     run_button.click(
         fn = infer,
-        inputs = [img_path, seed, randomize_seed, timesteps, cfg_scale, perform_norm_guidance],
+        inputs = [img_path, seed, randomize_seed, timesteps, cfg_scale, perform_norm_guidance, super_res],
        outputs = [result]
    )

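For reference, the optional super-resolution branch added to infer reduces to the following. This is a minimal sketch reusing the names from the diff (aura_sr, super_res, all_images); the extra None check is an assumption added here, not part of the commit, to keep the demo running when the aura-sr import above fails.

# Minimal sketch of the optional super-resolution step, using the names from the diff above.
# The "aura_sr is not None" guard is an assumption added for the case where the aura-sr
# import fails; the commit itself calls upscale_4x unconditionally whenever super_res is set.
def maybe_super_resolve(all_images, super_res, aura_sr):
    """Upscale each (PIL image, label) pair 4x with Aura-SR when requested and available."""
    if super_res and aura_sr is not None:
        return [(aura_sr.upscale_4x(img), label) for img, label in all_images]
    return all_images
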
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 flextok @ git+https://github.com/apple/ml-flextok
+aura-sr
 spaces
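
The new aura-sr requirement can also be sanity-checked outside the Space. Below is a minimal sketch using only the calls that appear in the commit (AuraSR.from_pretrained("fal-ai/AuraSR") and upscale_4x); the image paths are placeholders.

# Standalone check of the aura-sr dependency added in requirements.txt.
# "example_256.png" is a placeholder for any 256x256 RGB image.
from PIL import Image
from aura_sr import AuraSR

aura_sr = AuraSR.from_pretrained("fal-ai/AuraSR")
low_res = Image.open("example_256.png").convert("RGB")
high_res = aura_sr.upscale_4x(low_res)  # 256x256 -> 1024x1024, matching the demo's checkbox description
high_res.save("example_1024.png")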