roman-bachmann committed
Commit 0c0b8b9 · 1 Parent(s): b2e1f99

added optional SR

Files changed (2):
  1. app.py +23 -4
  2. requirements.txt +1 -0
app.py CHANGED
@@ -48,6 +48,13 @@ MODEL_NAME = 'FlexTok d18-d28 (DFN)'
 # Load FlexTok model from HF Hub
 flextok_model = FlexTokFromHub.from_pretrained(MODEL_ID).to(device).eval()
 
+# Load AuraSR model from HF Hub
+try:
+    from aura_sr import AuraSR
+    aura_sr = AuraSR.from_pretrained("fal-ai/AuraSR")
+except ImportError:
+    aura_sr = None
+
 
 def img_from_path(
     path: str,
@@ -71,7 +78,7 @@ def img_from_path(
 
 
 @spaces.GPU(duration=20)
-def infer(img_path, seed=0, randomize_seed=False, timesteps=20, cfg_scale=7.5, perform_norm_guidance=True):
+def infer(img_path, seed=1000, randomize_seed=False, timesteps=25, cfg_scale=7.5, perform_norm_guidance=True, super_res=True):
     if randomize_seed:
         seed = None
 
@@ -102,6 +109,9 @@ def infer(img_path, seed=0, randomize_seed=False, timesteps=20, cfg_scale=7.5, p
         for reconst_k, k_keep in zip(all_reconst, K_KEEP_LIST)
     ]
 
+    if super_res:
+        all_images = [(aura_sr.upscale_4x(img), label) for img, label in all_images]
+
     return all_images
 
 
@@ -143,7 +153,10 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
         Official demo for: <br>
         [**FlexTok: Resampling Images into 1D Token Sequences of Flexible Length**](https://arxiv.org/abs/2502.13967), arXiv 2025 <br>
 
-        This demo uses the FlexTok tokenizer to autoencode the given RGB input, using [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), running on *{power_device}*. The FlexTok encoder produces a 1D sequence of discrete tokens that are ordered in a coarse-to-fine manner. We show reconstructions from truncated subsequences, using the first 1, 2, 4, 8, ..., 256 tokens. As you will see, the first tokens capture more high-level semantic content, while subsequent ones add fine-grained detail.
+        This demo uses the FlexTok tokenizer to autoencode the given RGB input, using [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), running on *{power_device}*.
+        The FlexTok encoder produces a 1D sequence of discrete tokens that are ordered in a coarse-to-fine manner.
+        We show reconstructions from truncated subsequences, using the first 1, 2, 4, 8, ..., 256 tokens.
+        As you will see, the first tokens capture more high-level semantic content, while subsequent ones add fine-grained detail.
         """)
 
     img_path = gr.Image(label='RGB input image', type='filepath')
@@ -151,13 +164,19 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
 
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown(f"""
-            The FlexTok decoder is a rectified flow model. The following settings control the seed of the initial noise, the number of denoising timesteps, the guidance scale, and whether to perform [Adaptive Projected Guidance](https://arxiv.org/abs/2410.02416) (we recommend enabling it).
+            The FlexTok decoder is a rectified flow model. The following settings control the seed of the initial noise, the number of denoising timesteps,
+            the guidance scale, and whether to perform [Adaptive Projected Guidance](https://arxiv.org/abs/2410.02416) (we recommend enabling it).
+
+            This FlexTok model operates at 256x256 resolution. You can optionally super-resolve the reconstructions to 1024x1024 using Aura-SR for
+            sharper details, without changing the underlying reconstructed image too much. We enable it by default, but you can disable it if you would
+            like to see the raw 256x256 FlexTok reconstructions.
             """)
         seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=1000)
         randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
         timesteps = gr.Slider(label="Denoising timesteps", minimum=1, maximum=1000, step=1, value=25)
         cfg_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=15.0, step=0.1, value=7.5)
         perform_norm_guidance = gr.Checkbox(label="Perform Adaptive Projected Guidance", value=True)
+        super_res = gr.Checkbox(label="Super-resolve reconstructions from 256x256 to 1024x1024 with Aura-SR", value=True)
 
     result = gr.Gallery(
         label="Reconstructions", show_label=True, elem_id="gallery", type='pil',
@@ -174,7 +193,7 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
 
     run_button.click(
         fn = infer,
-        inputs = [img_path, seed, randomize_seed, timesteps, cfg_scale, perform_norm_guidance],
+        inputs = [img_path, seed, randomize_seed, timesteps, cfg_scale, perform_norm_guidance, super_res],
        outputs = [result]
    )

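For reference, the optional super-resolution branch added to infer reduces to the following. This is a minimal sketch reusing the names from the diff (aura_sr, super_res, all_images); the extra None check is an assumption added here, not part of the commit, to keep the demo running when the aura-sr import above fails.

# Minimal sketch of the optional super-resolution step, using the names from the diff above.
# The "aura_sr is not None" guard is an assumption added for the case where the aura-sr
# import fails; the commit itself calls upscale_4x unconditionally whenever super_res is set.
def maybe_super_resolve(all_images, super_res, aura_sr):
    """Upscale each (PIL image, label) pair 4x with Aura-SR when requested and available."""
    if super_res and aura_sr is not None:
        return [(aura_sr.upscale_4x(img), label) for img, label in all_images]
    return all_images
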
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 flextok @ git+https://github.com/apple/ml-flextok
+aura-sr
 spaces
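
The new aura-sr requirement can also be sanity-checked outside the Space. Below is a minimal sketch using only the calls that appear in the commit (AuraSR.from_pretrained("fal-ai/AuraSR") and upscale_4x); the image paths are placeholders.

# Standalone check of the aura-sr dependency added in requirements.txt.
# "example_256.png" is a placeholder for any 256x256 RGB image.
from PIL import Image
from aura_sr import AuraSR

aura_sr = AuraSR.from_pretrained("fal-ai/AuraSR")
low_res = Image.open("example_256.png").convert("RGB")
high_res = aura_sr.upscale_4x(low_res)  # 256x256 -> 1024x1024, matching the demo's checkbox description
high_res.save("example_1024.png")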